Project Description

These samples are basically an addition to the one sample in the paper that was analysed previously. They are cells from patients with ovarian cancer, we isolated the tumour and stromal cells for each patient. And have mixed them back together to give 1 sample per patient.

So the only difference from the previous sample is that these have the tumour and stromals from the same patient mixed together in a 75%:25% ratio

Aim

The aims were

1. To cluster them establish the 2 populations
2. To establish the over-dispersed genes in the two populations

From the first sample p53 and XIST were differentially expressed so you could use these to validate the initial clustering. The analysis after that would be the same as the previous sample

Data preperation

Loading library

library(Seurat)
library(scater)
library(ggplot2)
library(scran)
library(GGally)
library(mclust)
library(Rtsne)
library(viridis)
#library(umapr)
library(umap)
library(tidyverse)
library(paletteer)

Setting up color palettes. I was using paletter package as it was giving a colour blind palette. But now I am using the palette that Matt gave from Conor. This gives a useful clustering for Matt

Setting up colorblind friendly color palette:

#cbPalette <- paletteer_d(package = "ggthemes", palette="calc", n=12)
 c30 <- c("dodgerblue2",#1
         "#E31A1C", #2 red
          "green4", #3
         "#FF7F00", #4 orange
           "green1",#5
         "purple",#6
         "blue1",#7
         "deeppink1",#8
         "darkorange4",#9
          "black",#10
         "gold1",#11
         "darkturquoise",#12
         "#6A3D9A", #13 purple
         "orchid1",#14
         "gray70",#15
          "maroon",#16
         "palegreen2",#17
          "#333333",#18
          "#CAB2D6", #19 lt purple
          "#FDBF6F", #20 lt orange
         "khaki2",#21
         "skyblue2",#22
         "steelblue4",#23
         "green1",#24
         "yellow4",#25
         "yellow3",#26
         "#FB9A99", #27 lt pink
         "brown",#28
         "#000099",#29
         "#CC3300"#30
         )
 
pie(rep(1,30), col=c30)

Choosing for the samples

c_sample_col <- c30[c(3,22,19,30)]

Choosing for the clusters

c_clust_col <- c30[c(1,2,3,4,5,6,7,8,11,14,22,24)]

__ This data set was not normalized using 10X downsampling as the samples have large variability in their cell numbers.

Importing cell-ranger data

cellranger_pipestance_path <- "/home/ubuntu/Rna-seq_Data-Analysis/Louisa_Nelson_10X_Analysis/LN_aggr/outs/filtered_feature_bc_matrix/"
ovarianCancer <- Read10X(cellranger_pipestance_path)
#analysis_results <- load_cellranger_analysis_results(cellranger_pipestance_path)
ovarianCancer
33538 x 20717 sparse Matrix of class "dgCMatrix"
   [[ suppressing 73 column names ‘AAACCCACAGTTAGGG-1’, ‘AAACCCACATGTGTCA-1’, ‘AAACCCAGTCGCATGC-1’ ... ]]
   [[ suppressing 73 column names ‘AAACCCACAGTTAGGG-1’, ‘AAACCCACATGTGTCA-1’, ‘AAACCCAGTCGCATGC-1’ ... ]]
                                                                                                                                                                    
MIR1302-2HG . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
FAM138A     . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
OR4F5       . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.1  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.3  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.2  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AL627309.4  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......

 ..............................
 ........suppressing 20644 columns and 33525 rows in show(); maybe adjust 'options(max.print= *, width = *)'
 ..............................
   [[ suppressing 73 column names ‘AAACCCACAGTTAGGG-1’, ‘AAACCCACATGTGTCA-1’, ‘AAACCCAGTCGCATGC-1’ ... ]]
                                                                                                                                                                   
AC004556.1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AC233755.2 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AC233755.1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
AC240274.1 1 . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . 1 . . . . . . . . . . . . . . . . . . . . . 1 1 . . . . . . . . . . . . . . . . . . . . ......
AC213203.1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......
FAM231C    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ......

Extracting cell information

Sample1_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("1", colnames(ovarianCancer))], Sample="Patient1")
Sample2_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("2", colnames(ovarianCancer))], Sample="Patient2")
Sample3_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("3", colnames(ovarianCancer))], Sample="Patient3")
Sample4_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("4", colnames(ovarianCancer))], Sample="Patient4")
print(paste0('Number of Sample1 cells: ',dim(Sample1_barcodes)[1]))
[1] "Number of Sample1 cells: 5756"
print(paste0('Number of Sample2 cells: ',dim(Sample2_barcodes)[1]))
[1] "Number of Sample2 cells: 3974"
print(paste0('Number of Sample3 cells: ',dim(Sample3_barcodes)[1]))
[1] "Number of Sample3 cells: 6433"
print(paste0('Number of Sample4 cells: ',dim(Sample4_barcodes)[1]))
[1] "Number of Sample4 cells: 4554"

Confirmed this number with the QC output produced by cell ranger.

annotBarcode <- rbind(Sample1_barcodes, Sample2_barcodes, Sample3_barcodes, Sample4_barcodes)
rownames(annotBarcode) <- annotBarcode$barcode
head(annotBarcode)

Creating the SingleCellExperiment object using scater. We would be using the exprs function of gbm. Later on scater might add its own function to input 10X data, but before that we will be using this method

cdSc <- SingleCellExperiment(assays = list(counts = as.matrix(ovarianCancer)))
colData(cdSc)$barcode <- annotBarcode$barcode
colData(cdSc)$Sample <- annotBarcode$Sample
rowData(cdSc)$symbol <- rownames(ovarianCancer)
cdSc <- scater::calculateQCMetrics(cdSc)
cdSc
class: SingleCellExperiment 
dim: 33538 20717 
metadata(0):
assays(1): counts
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(8): symbol is_feature_control ... total_counts log10_total_counts
colnames(20717): AAACCCACAGTTAGGG-1 AAACCCACATGTGTCA-1 ... TTTGTTGGTCCTGGTG-4 TTTGTTGTCAGATTGC-4
colData names(11): barcode Sample ... pct_counts_in_top_200_features pct_counts_in_top_500_features
reducedDimNames(0):
spikeNames(0):

Here we use log2-counts-per-million with an offset of 1 as the exprs values. We have to note that the CPM for scater is different than the regular CPM calculation as we are considering the step-size here.

The value of the log-CPMs is explained by adding a prior count to avoid undefined values after the log-transformation, multiplying by a million, and dividing by the mean library size. This size factors are used to define the effective library sizes. This is done by scaling all size factors such that the mean scaled size factor is equal to the mean sum of counts across all features. The effective library sizes are then used to in the denominator of the CPM calculation. The way that scater calculates the log-normalized counts are

lib.sizes <- colSums(counts(example_sceset))
lib.sizes <- lib.sizes/mean(lib.sizes)
log2(counts(example_sceset)[1,]/lib.sizes+1)

We now normalize for all the counts

exprs(cdSc) <- log2(scater::calculateCPM(cdSc, use_size_factors = TRUE) + 1)

Saving Dataset

save.image()
# save.image('PC_20181002.RData')

QC analysis

At the start of the QC analysis we make different plots to visualize the summary statistics.

Cell QC

Low-quality cells need to be identified and removed to ensure that the technical effects do not distort downstream analysis results. Two common measures of cell quality are the library size and the number of expressed genes in each library. The library size is defined as the total sum of counts across all genes. Cells with relatively small library sizes are considered to be of low quality as the RNA has not been efficiently captured (i.e., converted into cDNA and amplified) during library preparation. The number of expressed genes in each cell is defined as the number of genes with non-zero counts for that cell. Any cell with very few expressed genes is likely to be of poor quality as the diverse transcript population has not been successfully captured.

Reads mapping with mitochondrial reads

We identify the rows corresponding to mitochondrial genes. Another important measure of quality is the proportion of reads mapped to genes in the mitochondrial genome. High proportions are indicative of poor-quality cells (Ilicic et al., 2016; Islam et al., 2014), possibly because of increased apoptosis and/or loss of cytoplasmic RNA from lysed cells.

Reads mapping with mitochondrial reads

We identify the rows corresponding to mitochondrial genes.

library(biomaRt)
my.ids <- gsub('\\..*','',rownames(cdSc))
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
chrName <- getBM(attributes=c('ensembl_gene_id','hgnc_symbol','chromosome_name'), filters = 'hgnc_symbol', values =my.ids, mart = ensembl)

Batch submitting query [==>--------------------------------------------------------------------------------------------------------------]   3% eta: 10s
Batch submitting query [====>------------------------------------------------------------------------------------------------------------]   4% eta: 12s
Batch submitting query [======>----------------------------------------------------------------------------------------------------------]   6% eta: 13s
Batch submitting query [=======>---------------------------------------------------------------------------------------------------------]   7% eta: 13s
Batch submitting query [=========>-------------------------------------------------------------------------------------------------------]   9% eta: 12s
Batch submitting query [===========>-----------------------------------------------------------------------------------------------------]  10% eta: 13s
Batch submitting query [============>----------------------------------------------------------------------------------------------------]  12% eta: 13s
Batch submitting query [==============>--------------------------------------------------------------------------------------------------]  13% eta: 13s
Batch submitting query [================>------------------------------------------------------------------------------------------------]  15% eta: 13s
Batch submitting query [=================>-----------------------------------------------------------------------------------------------]  16% eta: 12s
Batch submitting query [===================>---------------------------------------------------------------------------------------------]  18% eta:  1m
Batch submitting query [=====================>-------------------------------------------------------------------------------------------]  19% eta:  1m
Batch submitting query [======================>------------------------------------------------------------------------------------------]  21% eta: 50s
Batch submitting query [========================>----------------------------------------------------------------------------------------]  22% eta: 46s
Batch submitting query [==========================>--------------------------------------------------------------------------------------]  24% eta: 43s
Batch submitting query [===========================>-------------------------------------------------------------------------------------]  25% eta: 41s
Batch submitting query [=============================>-----------------------------------------------------------------------------------]  26% eta: 43s
Batch submitting query [===============================>---------------------------------------------------------------------------------]  28% eta: 40s
Batch submitting query [================================>--------------------------------------------------------------------------------]  29% eta: 38s
Batch submitting query [==================================>------------------------------------------------------------------------------]  31% eta: 36s
Batch submitting query [====================================>----------------------------------------------------------------------------]  32% eta: 34s
Batch submitting query [=====================================>---------------------------------------------------------------------------]  34% eta: 32s
Batch submitting query [=======================================>-------------------------------------------------------------------------]  35% eta: 32s
Batch submitting query [=========================================>-----------------------------------------------------------------------]  37% eta: 31s
Batch submitting query [==========================================>----------------------------------------------------------------------]  38% eta: 30s
Batch submitting query [============================================>--------------------------------------------------------------------]  40% eta: 37s
Batch submitting query [==============================================>------------------------------------------------------------------]  41% eta: 35s
Batch submitting query [===============================================>-----------------------------------------------------------------]  43% eta: 37s
Batch submitting query [=================================================>---------------------------------------------------------------]  44% eta: 35s
Batch submitting query [===================================================>-------------------------------------------------------------]  46% eta: 33s
Batch submitting query [====================================================>------------------------------------------------------------]  47% eta: 32s
Batch submitting query [======================================================>----------------------------------------------------------]  49% eta: 33s
Batch submitting query [=======================================================>---------------------------------------------------------]  50% eta: 32s
Batch submitting query [=========================================================>-------------------------------------------------------]  51% eta: 34s
Batch submitting query [===========================================================>-----------------------------------------------------]  53% eta: 33s
Batch submitting query [============================================================>----------------------------------------------------]  54% eta: 31s
Batch submitting query [==============================================================>--------------------------------------------------]  56% eta: 31s
Batch submitting query [================================================================>------------------------------------------------]  57% eta: 30s
Batch submitting query [=================================================================>-----------------------------------------------]  59% eta: 28s
Batch submitting query [===================================================================>---------------------------------------------]  60% eta: 27s
Batch submitting query [=====================================================================>-------------------------------------------]  62% eta: 25s
Batch submitting query [======================================================================>------------------------------------------]  63% eta: 24s
Batch submitting query [========================================================================>----------------------------------------]  65% eta: 22s
Batch submitting query [==========================================================================>--------------------------------------]  66% eta: 21s
Batch submitting query [===========================================================================>-------------------------------------]  68% eta: 20s
Batch submitting query [=============================================================================>-----------------------------------]  69% eta: 19s
Batch submitting query [===============================================================================>---------------------------------]  71% eta: 19s
Batch submitting query [================================================================================>--------------------------------]  72% eta: 18s
Batch submitting query [==================================================================================>------------------------------]  74% eta: 17s
Batch submitting query [====================================================================================>----------------------------]  75% eta: 15s
Batch submitting query [=====================================================================================>---------------------------]  76% eta: 15s
Batch submitting query [=======================================================================================>-------------------------]  78% eta: 15s
Batch submitting query [=========================================================================================>-----------------------]  79% eta: 13s
Batch submitting query [==========================================================================================>----------------------]  81% eta: 12s
Batch submitting query [============================================================================================>--------------------]  82% eta: 11s
Batch submitting query [==============================================================================================>------------------]  84% eta: 10s
Batch submitting query [===============================================================================================>-----------------]  85% eta:  9s
Batch submitting query [=================================================================================================>---------------]  87% eta:  8s
Batch submitting query [===================================================================================================>-------------]  88% eta:  7s
Batch submitting query [====================================================================================================>------------]  90% eta:  6s
Batch submitting query [======================================================================================================>----------]  91% eta:  5s
Batch submitting query [========================================================================================================>--------]  93% eta:  4s
Batch submitting query [=========================================================================================================>-------]  94% eta:  3s
Batch submitting query [===========================================================================================================>-----]  96% eta:  3s
Batch submitting query [=============================================================================================================>---]  97% eta:  2s
Batch submitting query [==============================================================================================================>--]  99% eta:  1s
Batch submitting query [=================================================================================================================] 100% eta:  0s
                                                                                                                                                        
chrName <- chrName[match(my.ids, chrName$hgnc_symbol),]
is.mito <- chrName$chromosome_name == "MT" & !is.na(chrName$chromosome_name)
sum(is.mito)
[1] 13
cdSc <- calculateQCMetrics(cdSc, feature_controls=list(Mt=is.mito))

Plotting the histograms of my QC entries.

par(mfrow=c(2,2))
hist((cdSc$total_counts)/1e6, xlab="Library sizes (millions)", main="Histogram of Library size",
     breaks=100, col="grey80", ylab="Number of cells")
hist(cdSc$total_features_by_counts, xlab="Number of expressed genes", main="Histogram of No. of Features",
     breaks=100, col="grey80", ylab="Number of cells")
hist(cdSc$pct_counts_Mt, xlab="Mitochondrial proportion (%)",
     ylab="Number of cells", breaks=100, main="Histogram of Mictochondria %", col="grey80")

The summary statistics for mitochondrial read proportion is reasonably good. As anything below 10% is very good and here the mean is around 7%. However, there are some outlier cells expressing very high mitocondrial genes. I am going to filter those genes out.

summary(cdSc$pct_counts_Mt)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   4.498   6.376   6.939   8.246  94.326 

It is also valuable to examine how the QC metrics behave with respect to each other. Generally, they will be in rough agreement, i.e., cells with low total counts will also have low numbers of expressed features and high mitochondrial proportions.

par(mfrow=c(1,3))
plot(cdSc$total_features_by_counts, cdSc$total_counts/1e6, xlab="Number of expressed genes",
    ylab="Library size (millions)")
plot(cdSc$total_features_by_counts, cdSc$pct_counts_Mt, xlab="Number of expressed genes",
    ylab="Mitochondrial proportion (%)")
plot(cdSc$total_counts/1e6, cdSc$pct_counts_Mt, xlab="Total counts (in millions)",
    ylab="Mitochondrial proportion (%)")

Also how the counts are distributed across the cells..

plotRowData(cdSc, x = "n_cells_by_counts", y = "log10_total_counts", colour_by = "is_feature_control_Mt")

In the figure above we see that some of the mitochondrial genes (marked in orange) at the top right corner of the figure has very high log10_total_counts and also are expressed in a large number of cells (almost in all cells). This is not very surprising as some of the mitochondrial genes are indeed expressed across all the cells as they are producing energy. The worrying part would be if these genes takes majority of a cell’s read (more than 80%). Then I would remove those cells.

plotRowData(cdSc, x = "n_cells_by_counts", y = "log10_total_counts", colour_by = "pct_dropout_by_counts")

As expected, cells with low number of reads have higher dropout counts.

plotColData(cdSc, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")

Again cells with lower read counts and lower total feature counts have very high proportion of mitchondrial reads, clearly indicating that these cells either started apoptosis or broken abruptly. We can safely remove these cells in our QC filtering.

To see the QC metrics related with cells,

names(colData(cdSc))
 [1] "barcode"                                        "Sample"                                         "is_cell_control"                               
 [4] "total_features_by_counts"                       "log10_total_features_by_counts"                 "total_counts"                                  
 [7] "log10_total_counts"                             "pct_counts_in_top_50_features"                  "pct_counts_in_top_100_features"                
[10] "pct_counts_in_top_200_features"                 "pct_counts_in_top_500_features"                 "total_features_by_counts_endogenous"           
[13] "log10_total_features_by_counts_endogenous"      "total_counts_endogenous"                        "log10_total_counts_endogenous"                 
[16] "pct_counts_endogenous"                          "pct_counts_in_top_50_features_endogenous"       "pct_counts_in_top_100_features_endogenous"     
[19] "pct_counts_in_top_200_features_endogenous"      "pct_counts_in_top_500_features_endogenous"      "total_features_by_counts_feature_control"      
[22] "log10_total_features_by_counts_feature_control" "total_counts_feature_control"                   "log10_total_counts_feature_control"            
[25] "pct_counts_feature_control"                     "pct_counts_in_top_50_features_feature_control"  "pct_counts_in_top_100_features_feature_control"
[28] "pct_counts_in_top_200_features_feature_control" "pct_counts_in_top_500_features_feature_control" "total_features_by_counts_Mt"                   
[31] "log10_total_features_by_counts_Mt"              "total_counts_Mt"                                "log10_total_counts_Mt"                         
[34] "pct_counts_Mt"                                  "pct_counts_in_top_50_features_Mt"               "pct_counts_in_top_100_features_Mt"             
[37] "pct_counts_in_top_200_features_Mt"              "pct_counts_in_top_500_features_Mt"             

and to see the QC metrics related with genes,

names(rowData(cdSc))
[1] "symbol"                "is_feature_control"    "is_feature_control_Mt" "mean_counts"           "log10_mean_counts"     "n_cells_by_counts"    
[7] "pct_dropout_by_counts" "total_counts"          "log10_total_counts"   
plotColData(cdSc, x = "log10_total_features_by_counts", y="log10_total_counts", size_by ="total_counts_Mt", colour_by  = "pct_counts_Mt")

All the above figures clearly show that the high proportions of reads mapped to genes in the mitochondrial genome is indicative of poor quality cells (Ilicic et al., 2016; Islam et al., 2014), possibly because of increased apoptosis and/or loss of cytoplasmic RNA from lysed cells. I would be filtering those cells out.

The following figure gives an idea of how many genes are expressed in how many cells.

plotExprsFreqVsMean(cdSc)

The genes looks good as they are not yet filtered.

Now, picking up a threshold for different metrices in cell filtering is not straightforward as their absolute values depend on the protocol and biological system. For example, sequencing to greater depth will lead to more reads, regardless of the quality of the cells. To obtain an adaptive threshold, we assume that most of the dataset consists of high-quality cells. To facilitate in picking a threshold for our cell cutoff we now plot couple of densitiies.

cut_off_reads <- median(cdSc$log10_total_counts) - 3*mad(cdSc$log10_total_counts)
df <- data.frame(x=cdSc$log10_total_counts, Sample = cdSc$Sample)
plot_reads <- ggplot(df,
       aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_reads, colour="grey", linetype = "longdash") +
       labs(x = expression('log'[10]*'(Library Size)'), title = "Total reads density", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col)
cut_off_mRNA <- median(cdSc$log10_total_features_by_counts) - 3*mad(cdSc$log10_total_features_by_counts)
df <- data.frame(x=cdSc$log10_total_features_by_counts, Sample = cdSc$Sample)
plot_mRNA <- ggplot(df,
       aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_mRNA, colour="grey", linetype = "longdash") +
       labs(x = expression('log'[10]*'(Number of expressed genes)'), title = "Total genes expressed", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col)
cut_off_MT <- median(cdSc$pct_counts_Mt) + 3*mad(cdSc$pct_counts_Mt)
df <- data.frame(x=cdSc$pct_counts_Mt, Sample = cdSc$Sample)
plot_MT <- ggplot(df,
       aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_MT, colour="grey", linetype = "longdash") +
       labs(x = expression('Pct of MT Expression'), title = "Mitochondrial Expression", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col)
multiplot(plot_reads, plot_MT, plot_mRNA,  cols=2)

df <- data.frame(x=cdSc$log10_total_counts)
as_tibble(df) %>%
  dplyr::mutate("Sample" = cdSc$Sample) %>%
  ggplot( aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_reads, colour="grey", linetype = "longdash") +
       labs(x = expression('log'[10]*'(Library Size)'), title = "Total reads density", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col) + facet_wrap(~Sample, nrow=2,ncol=2)

The distribution of the reads, number of expressed genes and the percent of MT expression. The shaded line represents the threshold which is the 3 Median Absolute Deviation (MAD). For the top two figures, cells on the left of this threshold would be filtered out and for the bottom figure, cells on the right of this figure would be filtered out.

We see that in the library size plot and also the plot with number of expressed features, there are a biomodal distribution. In the previous cellranger versions we generally do not observe this. But with new cellranger it can capture cells with lower RNA contents. Due to this we are now getting cells with lower read counts.

We remove cells with log-library sizes that are more than 4 median absolute deviations (MADs) below the median log-library size. (A log-transformation improves resolution at small values, especially when the MAD of the raw values is comparable to or greater than the median). We also remove cells where the log-transformed number of expressed genes is 4 MADs below the median. Similarly, we drop the cells having pct of MT expression below 3 MAD.

Although from the cellranger output it looked that the cells have very different total counts, but after removing the extreme outlier cells it looks like they have quite nice packed distribution.

However, in the G samples higher number of cells have higher read counts as can be seen from the individual density. But that does not impact the cutoff value. Below we check the cutoff values to validate our filtering.

print(paste0('Read count Cutoff: ',10^cut_off_reads))
[1] "Read count Cutoff: 1289.0232114447"
print(paste0('Genes count Cutoff: ',10^cut_off_mRNA))
[1] "Genes count Cutoff: 864.220903908248"
print(paste0('MT percent count Cutoff: ',cut_off_MT))
[1] "MT percent count Cutoff: 14.7140899519496"

We now calculate the cells and genes that are deemed to be outliers due to falling below 3 MAD.

Cells having read counts below 1289 (ByLibSize) and number of genes expressed below 864 (ByFeature) would be dropped and if cells are having more than 14% of mitochondrial reads, I would drop the cells.

libsize.drop <- isOutlier(cdSc$total_counts, nmads=3, type="lower", log=TRUE)
feature.drop <- isOutlier(cdSc$total_features_by_counts, nmads=3, type="lower", log=TRUE)
mito.drop <- isOutlier(cdSc$pct_counts_Mt, nmads=3, type="higher")

Below are the stats for the cells that would be dropped

data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop), 
           ByMito=sum(mito.drop))

So, based on library size, 825 cells would be dropped and based on number of features expressed, 1210 cells would be dropped and for Mitochondrial proportion of reads 517 cells would be dropped. Please note that all these cells are removed from the total population and also many of these cells would be overlapping across conditions.

Generating, the scater object with filtered profile.

cdScFilt <- cdSc[,!(libsize.drop | feature.drop | mito.drop)]
cdScFilt <- calculateQCMetrics(cdScFilt)

Before filtering:

cdSc
class: SingleCellExperiment 
dim: 33538 20717 
metadata(0):
assays(2): counts logcounts
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(9): symbol is_feature_control ... total_counts log10_total_counts
colnames(20717): AAACCCACAGTTAGGG-1 AAACCCACATGTGTCA-1 ... TTTGTTGGTCCTGGTG-4 TTTGTTGTCAGATTGC-4
colData names(38): barcode Sample ... pct_counts_in_top_200_features_Mt pct_counts_in_top_500_features_Mt
reducedDimNames(0):
spikeNames(0):

After filtering:

cdScFilt
class: SingleCellExperiment 
dim: 33538 19248 
metadata(0):
assays(2): counts logcounts
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(9): symbol is_feature_control_Mt ... total_counts log10_total_counts
colnames(19248): AAACCCACAGTTAGGG-1 AAACCCACATGTGTCA-1 ... TTTGTTGGTCCTGGTG-4 TTTGTTGTCAGATTGC-4
colData names(38): barcode Sample ... pct_counts_in_top_200_features pct_counts_in_top_500_features
reducedDimNames(0):
spikeNames(0):

Before filtering number of cells were,

print(paste0("Cells before filtering: ",dim(cdSc)[2]))
[1] "Cells before filtering: 20717"

After filtering remaining cells are:

print(paste0("Cells remaining after filtering: ",dim(cdScFilt)[2]))
[1] "Cells remaining after filtering: 19248"

For the four samples, number of cells that are remaining are:

print(levels(as.factor(colData(cdSc)$Sample)))
[1] "Patient1" "Patient2" "Patient3" "Patient4"
print(paste0('Before Filtering: ', table(colData(cdSc)$Sample)))
[1] "Before Filtering: 5756" "Before Filtering: 3974" "Before Filtering: 6433" "Before Filtering: 4554"
print(paste0('After Filtering: ', table(colData(cdScFilt)$Sample)))
[1] "After Filtering: 5317" "After Filtering: 3804" "After Filtering: 6084" "After Filtering: 4043"

So, the cells that are being filtered out are kind of evenly distributed across the samples. This is kind of reassuring that there was not a point failure where one sample totally failed. After filtering we have in total 21609 cells remaining for downstream analysis which is a very reasonable number.

Now, I will again run couple of QCs to see whether further filtering is required.

Relationship between total counts and total features before filtering:

p <- plotColData(cdSc, x="log10_total_counts",y="total_features_by_counts", colour_by = "Sample") +
       scale_fill_manual(values=c_sample_col)
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p

After filtering:

p <- p <- plotColData(cdScFilt, x="log10_total_counts",y="total_features_by_counts", colour_by = "Sample") +
       scale_fill_manual(values=c_sample_col)
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p

We could see that some of the outlier cells have been dropped.

Before filtering:

plotColData(cdSc, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")

After filtering:

plotColData(cdScFilt, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")

We see that the good cells are kept while cells with very high Mt reads are being removed.

Now to see about the higher read depth cells I plot violin plots. This would give me an indication as whether there is any obvious outliers. I will be plotting the cells for TotalCounts and TotalFeaturesMitochondria.

df <- data.frame(Cell=colnames(cdScFilt), CellType=cdScFilt$Sample, totalFeatures=cdScFilt$total_features_by_counts, totalCount=cdScFilt$total_counts, PctTotalCountMt=cdScFilt$pct_counts_Mt, Sample=cdScFilt$Sample)
p1 <- ggplot(df, aes(factor(CellType),totalCount,colour=Sample))
p1 <- p1 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))
p2 <- ggplot(df, aes(factor(CellType),PctTotalCountMt,colour=Sample))
p2 <- p2 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))
p3 <- ggplot(df, aes(factor(CellType),totalFeatures,colour=Sample))
p3 <- p3 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))
multiplot(p1,p2,p3,cols = 2)

It clearly looks like that two cells above total counts of 30000 are outliers. This again would be removed as it would otherwise might be multiplets

cdScFilt
class: SingleCellExperiment 
dim: 33538 19248 
metadata(0):
assays(2): counts logcounts
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(9): symbol is_feature_control_Mt ... total_counts log10_total_counts
colnames(19248): AAACCCACAGTTAGGG-1 AAACCCACATGTGTCA-1 ... TTTGTTGGTCCTGGTG-4 TTTGTTGTCAGATTGC-4
colData names(38): barcode Sample ... pct_counts_in_top_200_features pct_counts_in_top_500_features
reducedDimNames(0):
spikeNames(0):
cdScFilt <- cdScFilt[,!(cdScFilt$total_counts > 30000)]
cdScFilt
class: SingleCellExperiment 
dim: 33538 19246 
metadata(0):
assays(2): counts logcounts
rownames(33538): MIR1302-2HG FAM138A ... AC213203.1 FAM231C
rowData names(9): symbol is_feature_control_Mt ... total_counts log10_total_counts
colnames(19246): AAACCCACAGTTAGGG-1 AAACCCACATGTGTCA-1 ... TTTGTTGGTCCTGGTG-4 TTTGTTGTCAGATTGC-4
colData names(38): barcode Sample ... pct_counts_in_top_200_features pct_counts_in_top_500_features
reducedDimNames(0):
spikeNames(0):

Drawing the vuiolin plot again

df <- data.frame(Cell=colnames(cdScFilt), CellType=cdScFilt$Sample, totalFeatures=cdScFilt$total_features_by_counts, totalCount=cdScFilt$total_counts, PctTotalCountMt=cdScFilt$pct_counts_Mt, Sample=cdScFilt$Sample)
p1 <- ggplot(df, aes(factor(CellType),totalCount,colour=Sample))
p1 <- p1 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))
p2 <- ggplot(df, aes(factor(CellType),PctTotalCountMt,colour=Sample))
p2 <- p2 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))
p3 <- ggplot(df, aes(factor(CellType),totalFeatures,colour=Sample))
p3 <- p3 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))
multiplot(p1,p2,p3,cols = 2)

Looks a good distribution to move forward.

save.image()

Classification of cell cycle phase

We use the prediction method described by Scialdone et al. (2015) to classify cells into cell cycle phases based on the gene expression data. Using a training dataset, the sign of the difference in expression between two genes was computed for each pair of genes. Pairs with changes in the sign across cell cycle phases were chosen as markers. Cells in a test dataset can then be classified into the appropriate phase, based on whether the observed sign for each marker pair is consistent with one phase or another. We do the cell cycle classification before gene filtering as this provides more precise cell cycle phase classifications. This approach is implemented in the Cyclone function using a pre-trained set of marker pairs for human data. Some additional work is necessary to match the gene symbols in the data to the Ensembl annotation in the pre-trained marker set.

cdScFiltAnnot <- cdScFilt
hg.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
library(org.Hs.eg.db)
Loading required package: AnnotationDbi

Attaching package: ‘AnnotationDbi’

The following object is masked from ‘package:dplyr’:

    select
anno <- select(org.Hs.eg.db, keys=as.character(rownames(cdScFiltAnnot)), keytype="SYMBOL", column="ENSEMBL")
'select()' returned 1:many mapping between keys and columns
ensembl <- anno$ENSEMBL[match(as.character(rownames(cdScFiltAnnot)), anno$SYMBOL)]
assignments <- cyclone(cdScFiltAnnot, hg.pairs, gene.names=ensembl)
df <- data.frame(x=assignments$score$G1, y=assignments$score$G2M, Sample=colData(cdScFilt)$Sample)
 p<-ggplot(data=df, aes(x=x,y=y,color=Sample)) + 
    geom_point(size=0.5)+
    xlab("G1 score")+
    ylab("G2M score")+
    ylim(0,1)+
    xlim(0,1)+
    ggtitle("Cell-cycle effects")+
    theme_light(base_size=15)+
    theme(axis.title.x = element_text(size=10, vjust=-2),
          axis.text.x  = element_text( size=10),
          axis.title.y = element_text( size=10,vjust=2),
          axis.text.y  = element_text( size=10)) +
    theme(plot.margin=unit(c(1,1,1.5,1.2),"cm"))+
    theme(legend.text=element_text(size=10),#size of legend
          legend.title=element_text(size=10), 
          plot.title = element_text(size=20, face="bold")) +
    scale_colour_manual(values=c_sample_col) + 
    #scale_color_discrete(name=legend.title)+
    geom_segment(aes(x = 1/2, y = 0, xend=1/2, yend=1/2),colour="black") + 
    geom_segment(aes(x = 0, y = 1/2, xend=1/2, yend=1/2),colour="black") +
    geom_segment(aes(x = 1/2, y = 1/2, xend=1, yend=1),colour="black") +
    annotate("text", x=0.05, y=0.05, label="S", size=8)+
    annotate("text", x=0.95, y=0.25, label="G1", size=8)+
    annotate("text", x=0.25, y=0.95, label="G2M", size=8)
print(p)

smoothScatter(assignments$score$G1, assignments$score$G2M, xlab="G1 score", ylab="G2/M score", pch=16, cex=0.6)

Cells are classified as being in G1 phase if the G1 score is above 0.5 and greater than the G2/M score; in G2/M phase if the G2/M score is above 0.5 and greater than the G1 score; and in S phase if neither score is above 0.5. Here, the vast majority of cells are classified as being in G1 phase.

It looks like majority of the cells are in high G2M or S score indicating that the cells are INDEED going through cell-cycle stages. This is we would also expect from Cancer cells.

This method would be less accurate for data that are substantially different from those used in the training set, e.g., due to the use of a different protocol. This dataset uses UMI counts, which has an entirely different set of biases, e.g., 3’-end coverage only, no length bias, no amplification noise. These new biases (and the absence of expected biases) may interfere with accurate classification of some cells. So we are not particularly sure about this result. Nevertheless we need to keep in mind that there could be quite high cell-cycle effect which might confound the dataset. To avoid problems from misclassification, I will not perform any processing of this dataset by cell cycle phase. This is unlikely to be problematic for this analysis, as the cell cycle effect will be relatively subtle compared to the obvious differences between cell types in a diverse population. Thus, the former is unlikely to distort the conclusions regarding the latter.

colData(cdScFilt)$CellCycle <- assignments$phases
colData(cdScFiltAnnot)$CellCycle <- assignments$phases
plotPCA(cdScFiltAnnot, colour_by = "CellCycle", shape_by="Sample")

It does not look like one patient is dominated by a cell-cycle phase.

Filtering out low-abundance genes

Low-abundance genes are problematic as zero or near-zero counts do not contain enough information for reliable statistical inference (Bourgon et al., 2010). In addition, the discreteness of the counts may interfere with downstream statistical procedures, e.g., by compromising the accuracy of continuous approximations.

Generally for low-abundance genes are defined as those with an average count below a filter threshold of 1. But 10X Chromium is based on UMI counts, which would have understandably low counts, so setting the threshold to 1 would filter a large number of cells. Here I am going to set it to 0.01. The figure below will explain the reason.

These genes are likely to be dominated by drop-out events (Brennecke et al., 2013), which limits their usefulness in later analyses. Removal of these genes mitigates discreteness and reduces the amount of computational work without major loss of information.

ave.counts <- rowMeans(counts(cdScFilt))
keep <- ave.counts >= 0.01
sum(keep)
[1] 13210

After filtering with average count of 0.01 there are 13210 genes left for the downsteram analysis.

To check whether the chosen threshold is suitable, we examine the distribution of log-means across all genes (Figure below). Generally for higher number of cells there is a peak on the right hand side that represents the bulk of moderately expressed genes while in the middle there is a rectangular component that corresponds to lowly expressed genes. The filter threshold should cut the distribution at some point along the rectangular component to remove the majority of low-abundance genes. As the blue line repsresents in the figure below, it cuts the counts at the rectangular component.

hist(log10(ave.counts), breaks=100, main="", col="grey80",
     xlab=expression(Log[10]~"average count"))
abline(v=log10(0.01), col="blue", lwd=2, lty=2)

We will look at the identities of the most highly expressed genes before filtering them. This should generally be dominated by constitutively expressed transcripts, such as those for ribosomal or mitochondrial proteins. The presence of other classes of features may be cause for concern if they are not consistent with expected biology. For example, the absence of ribosomal proteins and/or the presence of their pseudogenes are indicative of suboptimal alignment.

plotHighestExprs(cdScFiltAnnot)

pdf('Highest_expression_50Gene.pdf')
plotHighestExprs(cdScFiltAnnot)
dev.off()
null device 
          1 

Percentage of total counts assigned to the top 50 most highly-abundant features in the dataset.

For each feature, each bar represents the percentage assigned to that feature for a single cell, while the circle represents the average across all cells. Bars are coloured by the total number of expressed features in each cell, while circles are coloured according to whether the feature is labelled as a control feature.

Do the topmost genes appear to agree with the biology?

Louisa: Please look at the genes whether those make sense…._

Now for the filteirng, I generally prefer the mean-based filter as it tends to be less aggressive. A gene will be retained as long as it has sufficient expression in any subset of cells. Genes expressed in fewer cells require higher levels of expression in those cells to be retained, but this is not undesirable as it avoids selecting uninformative genes (with low expression in few cells) that contribute little to downstream analyses, e.g., HVG detection or clustering. In contrast, the “at least n” filter depends heavily on the choice of n. With n = 10, a gene expressed in a subset of 9 cells would be filtered out, regardless of the level of expression in those cells. This may result in the failure to detect rare subpopulations that are present at frequencies below n. While the mean-based filter will retain more outlier-driven genes, this can be handled by choosing methods that are robust to outliers in the downstream analyses.

Thus, we apply the mean-based filter to the data by subsetting the SCESet object as shown below. This removes all rows corresponding to endogenous genes or spike-in transcripts with abundances below the specified threshold.

cdScFilt <- cdScFilt[keep,]
cdScFiltAnnot <- cdScFilt
#cdScFiltAnnot <- calculateQCMetrics(cdScFiltAnnot)
cdScFiltAnnot
class: SingleCellExperiment 
dim: 13210 19246 
metadata(0):
assays(2): counts logcounts
rownames(13210): AL669831.5 LINC00115 ... AL354822.1 AC240274.1
rowData names(9): symbol is_feature_control_Mt ... total_counts log10_total_counts
colnames(19246): AAACCCACAGTTAGGG-1 AAACCCACATGTGTCA-1 ... TTTGTTGGTCCTGGTG-4 TTTGTTGTCAGATTGC-4
colData names(39): barcode Sample ... pct_counts_in_top_500_features CellCycle
reducedDimNames(0):
spikeNames(0):

The filtering now lets see how the total counts and features plots:

p <- plotColData(cdScFilt, x="log10_total_counts",y="total_features_by_counts", colour_by = "Sample")  + scale_fill_manual(values=c_sample_col) 
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
p

What is the frequency of the cells with mean expression?:

plotExprsFreqVsMean(cdScFiltAnnot)

Overall looks a reasonable number of genes expressed in 50% of cells.

p <- plotColData(cdScFilt, x="total_counts",y="total_features_by_counts", colour_by = "Sample") + scale_colour_manual(values=c_sample_col) 
p2 <- plotRowData(cdScFilt, x = "log10_total_counts", y = "n_cells_by_counts", colour_by = "pct_dropout_by_counts")
multiplot(plotPCA(cdScFilt, colour_by="Sample",
     size_by="total_features_by_counts") + scale_fill_manual(values=c_sample_col) ,
          p,
          p2,
          cols=2)
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.

summary(rowData(cdScFilt)$pct_dropout_by_counts)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 0.05195 74.86492 88.77546 81.49701 95.74891 99.71945 

Some of the genes in the top right figure still has high drop-outs. This is not unusual for a single-cell RNA-seq as the data is very sparse and specially when using UMI based counts. If there are genes that have very high read counts, but are then expressed only very few cells, that would be worrying as it would indicate that those cells are behaving abruptly.

So things look fine in QC.

Normalization of cell-specific biases

Now, we would be doing step size based normalization. One of the widely used step-wise normalization technique for Bulk data is the one used by DESeq2. However, as the single cell data is very sparse we cannot use this method. The other very efficient method is the deconvolution based method which we would be using here.

Using the deconvolution method to deal with zero counts: Read counts are subject to differences in capture efficiency and sequencing depth between cells (Stegle et al., 2015). Normalization is required to eliminate these cell-specific biases prior to downstream quantitative analyses. This is often done by assuming that most genes are not differentially expressed (DE) between cells. Any systematic difference in count size across the non-DE majority of genes between two cells is assumed to represent bias and is removed by scaling. More specifically, “size factors” are calculated that represent the extent to which counts should be scaled in each library.

Single-cell data can be problematic due to the dominance of low and zero counts. To overcome this, we pool counts from many cells to increase the count size for accurate size factor estimation (Lun et al., 2016). Pool-based size factors are then “deconvolved” into cell-based factors for cell-specific normalization.

cdScFiltAnnot <- computeSumFactors(cdScFiltAnnot, sizes=c(100, 200, 300, 400, 440, 480, 500, 550))
summary(sizeFactors(cdScFiltAnnot))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.06796 0.65800 0.93359 1.00000 1.24521 5.71193 

If the size factors are tightly correlated with the library sizes for all cells this would suggests that the systematic differences between cells are primarily driven by differences in capture efficiency or sequencing depth. Any DE between cells would yield a non-linear trend between the total count and size factor, and/or increased scatter around the trend.

DF <- data.frame(VAR1=sizeFactors(cdScFiltAnnot), VAR2=cdScFiltAnnot$total_counts/1e6)
model = lm(VAR2 ~ VAR1, DF)
summary(model)

Call:
lm(formula = VAR2 ~ VAR1, data = DF)

Residuals:
       Min         1Q     Median         3Q        Max 
-0.0083548 -0.0007298 -0.0001815  0.0005718  0.0087967 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 7.185e-04  1.911e-05   37.59   <2e-16 ***
VAR1        6.324e-03  1.713e-05  369.10   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.001175 on 19244 degrees of freedom
Multiple R-squared:  0.8762,    Adjusted R-squared:  0.8762 
F-statistic: 1.362e+05 on 1 and 19244 DF,  p-value: < 2.2e-16
sp = ggplot(DF, aes(x=VAR1, y=VAR2))
sp + geom_point() + stat_smooth(method=lm) + theme_light(base_size=15) +
        theme(strip.background = element_blank(),
              panel.border     = element_blank(),
              plot.title = element_text(hjust = 0.5)) +  
              xlab("Size Factor") + 
              ylab("Library Size (millions)")+
              annotate("text", label="R^2=0.8762", x=2.8, y=0.075)

With a high R^2 value, the size factors are very tightly correlated with the library size. I calculate the size factor normization and put it in a slot of SummarizedExperiment object.

Applying the size factors to normalize gene expression:

The count data are used to compute normalized log-expression values for use in downstream analyses. Each value is defined as the log-ratio of each count to the size factor for the corresponding cell, after adding a prior count of 1 to avoid undefined values at zero counts. Division by the size factor ensures that any cell-specific biases are removed. If spike-in-specific size factors are present in sce, they will be automatically applied to normalize the spike-in transcripts separately from the endogenous genes.

#cdScFiltAnnot<- normalize(cdScFiltAnnot)
norm_exprs(cdScFiltAnnot)<- scater::normalize(cdScFiltAnnot)

Re-calculate the CPM normalization

I recalculate the counts-per-million for these cells. This is because large number of genes have been filtered out which would have impacted the library size when CPM was calcualted with unfiltered data. Now, as the number of genes have been reduced, so is the library size.

# cpm(cdScFiltAnnot) <- log2(calculateCPM(cdScFiltAnnot, use_size_factors = TRUE) + 1)
exprs(cdScFiltAnnot) <- log2(calculateCPM(cdScFiltAnnot, use_size_factors = TRUE) + 1)

I set the parameter use.size.factors = TRUE. This would construct the effective library size instead of the sum of counts as the library size. This effective library size is defined based on the step size calcualted as described in the pool based method paper.

The log-transformation provides some measure of variance stabilization (Law et al., 2014), so that high-abundance genes with large variances do not dominate downstream analyses. The computed values are stored as an exprs matrix in addition to the other assay elements.

Checking for important technical factors

We check whether there are technical factors that contribute substantially to the heterogeneity of gene expression. If so, the factor may need to be regressed out to ensure that it does not inflate the variances or introduce spurious correlations. For this dataset, the simple experimental design means that there are no plate or batch effects to examine. However there could be other technical factors like the cell-cycle effect or dropouts

plotExplanatoryVariables(cdScFiltAnnot, variables=c("total_features_by_counts", "total_counts", "CellCycle", "pct_counts_Mt", "Sample"))

Among the important technical factors, we can see that number of total features (genes) and total counts are explaining the major variability in the dataset. This confirms that cells would be mostly varied based on number of genes that they are expressing. In the cell-cycle plot showed earlier, we showed that there is cell-cycle effect but it might not be confounding the analysis.

Number of total features explaining majority of the variability is a common issue in single-cell RNA-seq. This could be due to the sparsity of the data. We need to keep in mind that cells in the PCA plot might be seperating just because they have very different number of features expressed in total.

However, I am expecting this would not confound the t-SNE plot that would take the non-linear relationship between the variables.

Identifying HVGs from the normalized log-expression

This is one of the very important excercies.

I now identify HVGs to focus on the genes that are driving heterogeneity across the population of cells. This requires estimation of the variance in expression for each gene, followed by decomposition of the variance into biological and technical components. HVGs are then identified as those genes with the largest biological components. This avoids prioritizing genes that are highly variable due to technical factors, such as sampling noise during RNA capture and library preparation.

In the recent implementation of seurat, Rahul Satija took a slightly different approach for HVG calculation. They calculate the variance and mean for each gene in the dataset (storing this in object@hvg.info), and sorts genes by their variance/mean ratio (VMR). They have observed that for large-cell datasets with unique molecular identifiers, selecting highly variable genes (HVG) simply based on VMR is an efficient and robust strategy.

But in our implementation we fit the trend to the variance estimates of the endogenous genes, using the use.spikes=FALSE setting as shown below. This assumes that the majority of genes are not variably expressed, such that the technical component dominates the total variance for those genes. The fitted value of the trend is then used as an estimate of the technical component.

span: Low-abundance genes with mean log-expression below min.mean are not used in trend fitting, to preserve the sensitivity of span-based smoothers at moderate-to-high abundances. It also protects against discreteness, which can interfere with estimation of the variability of the variance estimates and accurate scaling of the trend. The default threshold is chosen based on the point at which discreteness is observed in variance estimates from Poisson-distributed counts. For heterogeneous droplet data, a lower threshold of 0.001-0.01 should be used.

var.fit <- trendVar(cdScFiltAnnot, method="loess", use.spikes=FALSE, min.mean=1.0)
var.out <- decomposeVar(cdScFiltAnnot, var.fit)

We assess the suitability of the trend fitted to the endogenous variances by examining whether it is consistent with the variances. The trend passes through or close to most of the endogenous gene variances, indicating that our assumption (that most genes have low levels of biological variability) is valid. This strategy exploits the large number of endogenous genes to obtain a stable trend.

plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", ylab="Variance of log-expression")
o <- order(var.out$mean)
lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2)

Ideally the plot above would look like the mountain, where it would have rise in the middle but then would be dropping at the end as we have low variance for highly expressed genes. The trend line would follow it as well.

HVGs are defined as genes with biological components that are significantly greater than zero at a false discovery rate (FDR) of 5%. These genes are interesting as they drive differences in the expression profiles between cells, and should be prioritized for further investigation. In addition, I only consider a gene to be a HVG if it has a biological component greater than or equal to 0.5. For transformed expression values on the log2 scale, this means that the average difference in true expression between any two cells will be at least 2-fold. (This reasoning assumes that the true log-expression values are Normally distributed with variance of 0.5. The root-mean-square of the difference between two values is treated as the average log2-fold change between cells and is equal to unity.) I rank the results by the biological component to focus on genes with larger biological variability.

hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$bio >= 0.5),]
hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),]
nrow(hvg.out)
[1] 632

So, there are 632 HVGs for this dataset.

Plotting the HVGs. The red dots are the HVGs we selected.

plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
     ylab="Variance of log-expression")
o <- order(var.out$mean)
lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2)
points(var.out[rownames(hvg.out),]$mean, var.out[rownames(hvg.out),]$total, col="red", pch=16, cex=0.6)

Plotting the top 15 HVGs:

plotExpression(cdScFiltAnnot, rownames(hvg.out)[1:15])

Discuss meaning of genes.

write.table(file="HVG_Louisa_Nelson_10X_Dataset.tsv", hvg.out, sep="\t", quote = FALSE, col.names = NA)
head(hvg.out, 20)
DataFrame with 20 rows and 6 columns
                     mean            total              bio             tech               p.value                   FDR
                <numeric>        <numeric>        <numeric>        <numeric>             <numeric>             <numeric>
TIMP1    8.22822611977597 23.2445946615811 13.4772653842641 9.76732927731701                     0                     0
KRT81    3.79280737225925 26.3679279637404 11.4369375039588 14.9309904597816                     0                     0
TPT1     9.99163026093074 18.3841149265066 10.5915443828566 7.79257054365002                     0                     0
FN1      7.41597123112208 21.4155024164406  10.560609699778 10.8548927166626                     0                     0
IFI27    4.60730444445383 23.7733388099039 8.62808568555934 15.1452531243445                     0                     0
...                   ...              ...              ...              ...                   ...                   ...
TFPI2    5.29991228294919 19.7089953327959 5.56456654848151 14.1444287843144 2.25433612863366e-260 1.24082417746878e-257
IGFBP5   2.17462766498083 17.1101529380375 5.54840396255214 11.5617489754854                     0                     0
SERPINE1 8.10494233613672 15.2842598771331 5.37040136183392 9.91385851529918                     0                     0
SERPINE2 3.52377909901619 19.9367397348913 5.27800359109321 14.6587361437981 3.35932681708548e-222 1.47922357512331e-219
CCDC80    5.4689651815356 19.0482949418358 5.26705547316059 13.7812394686752 2.93700403144984e-247 1.49222397136356e-244

There are other alternative approaches for determining the HVGs specially those based on Coefficient of Variance. Here I use the variance of the log-expression values because the log-transformation protects against genes with strong expression in only one or two cells. This ensures that the set of top HVGs is not dominated by genes with (mostly uninteresting) outlier expression patterns.

However, I would like to also test other methods of identifying the HVGs. It has been mentioned that fitting the trendline to endogenous genes might not always be a good idea.

Downstream analysis with HVG genes

I will see instead of taking the correlated genes how the result comes up with the HVGs only.

rowVarsSorted <- exprs(cdScFiltAnnot)[rownames(hvg.out),]
FinalPCAData <- t(rowVarsSorted)
pcaPRComp <- prcomp(FinalPCAData)
nmax = 10
if(dim(pcaPRComp$x)[1] < 10){ nmax = dim(pcaPRComp$x)[1] }
txt1 <- paste("Percent_PC_Var_onfirst",nmax,"PCs",sep="")
pca_var = pcaPRComp$sdev ^ 2
pca_var_percent <- 100 * pca_var / sum(pca_var)
pca_var_percent_first10 <- NA * pca_var
pca_var_percent_first10[1:nmax] <- 100 * pca_var[1:nmax] / sum(pca_var[1:nmax])
#pca_corr_reads <- apply(pcaPRComp$x,2,function(x) cor(x,report_sub$Assigned))
    
pca_var_out <- data.frame(round(pca_var,3),round(pca_var_percent,1),
                          round(pca_var_percent_first10,1))
#rownames(pca_var_out) <- rownames(pcaPRComp$x)
colnames(pca_var_out) <- c("PC_Var","PC_Var_percent",txt1)
nColToDisplay = 5
df <- as.data.frame(pcaPRComp$x)
df$Cell=as.factor(colData(cdScFiltAnnot)$Sample)
p <- ggpairs(df, columns=1:nColToDisplay, upper=list(continuous="points"), 
             title='Plotting first four PCAs', 
             mapping = aes_string(color="Cell"),
             legend = c(1,nColToDisplay),
             columnLabels = as.character(paste0(colnames(df[,1:nColToDisplay]), ' : ', 
                                                pca_var_out$PC_Var_percent[1:nColToDisplay], '% variance')))+
    theme_light(base_size=15)+     
    theme(plot.title = element_text(hjust = 0.5))
for(i in 1:p$nrow) {
  for(j in 1:p$ncol){
    p[i,j] <- p[i,j] + 
        scale_fill_manual(values=c_sample_col) +
        scale_colour_manual(values=c_sample_col)  
  }
}
    
print(p)

The results does not do much with the chosen and the HVG ones. So for the downstream analysis I will take the HVG genes only instead of the correlated HVGs.

t-SNE plot with only the HVG genes:

tsne_out <- Rtsne(as.matrix(pcaPRComp$x[,1:14]),check_duplicates = FALSE, pca = FALSE,
             perplexity=30, theta=0.01, dims=2, num_threads = 8)
Rep <- as.factor(colData(cdScFiltAnnot)$Sample)
counts <- colData(cdScFiltAnnot)$total_counts
p <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=counts)) +
     geom_point(size=0.75) +
     guides(colour = guide_legend(override.aes = list(size=0.5))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of Sample Data") +
     theme_classic(base_size=14) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank())
p2 <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=Rep)) +
     geom_point(size=0.75) +
     guides(colour = guide_legend(override.aes = list(size=0.8))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of Expression Data") +
     theme_classic(base_size=10) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) +
          scale_fill_manual(values=c_sample_col) +
         scale_colour_manual(values=c_sample_col)
p2

#multiplot(p,p2,cols=2)
save.image()

So the t-SNe plot clearly seperates the patients. This is what we generally see when we have multiple patients. The gene expression profile for every patient itself is very different for cells and that is why cells from different patients come differently. One interesting thing is a cluster with all the cells mixed. Now, this could be the 25% of stromal cells from each patient where as the heterogeneity in tumour samples seperate the patients. ### Running Umap for data visualization

embedding <- umap(pcaPRComp$x[,1:14])
as.tibble(embedding$layout) %>%
  mutate(Samples = colData(cdScFiltAnnot)$Sample) %>%
  ggplot(aes(V1, V2, color=Samples)) +  geom_point(size=0.75) + theme_classic() +scale_colour_manual(values=c_sample_col) 

Clustering

There are numerous methods of clustering, the dynamic-cut-tree, the k-mean, k-medoids, GMM, GP-LVM and so on. I will straight jump into dynamic-cut-tree as this is the one that performed best while I was doing analysis with the naive and Aspd12 dataset.

Dynamic Cut Tree

Hierarchical clustering is a widely used method for detecting clusters in genomic data. Clusters are defined by cutting branches off the dendrogram. A common but inflexible method uses a constant height cutoff value; this method exhibits suboptimal performance on complicated dendrograms. We present the Dynamic Tree Cut R library that implements novel dynamic branch cutting methods for detecting clusters in a dendrogram depending on their shape. Compared to the constant height cutoff method, our techniques offer the following advantages: (1) they are capable of identifying nested clusters; (2) they are flexible — cluster shape parameters can be tuned to suit the application at hand; (3) they are suitable for automation; and (4) they can optionally combine the advantages of hierarchical clustering and partitioning around medoids, giving better detection of outliers.

Clustering cells into putative subpopulations

We perform hierarchical clustering on the Euclidean distances between cells, using Ward’s criterion to minimize the total variance within each cluster. This yields a dendrogram that groups together cells with similar expression patterns across the chosen genes. An alternative approach is to cluster on a matrix of distances derived from correlations (e.g., as in quickCluster). This is more robust to noise and normalization errors, but is also less sensitive to subtle changes in the expression profiles.

chosen.exprs <- logcounts(cdScFiltAnnot[rownames(hvg.out),])
my.dist <- dist(t(chosen.exprs))
my.tree <- hclust(my.dist, method = "ward.D2")

Clusters are explicitly defined by applying a dynamic tree cut (Langfelder et al., 2008)[https://academic.oup.com/bioinformatics/article/24/5/719/200751] to the dendrogram. This exploits the shape of the branches in the dendrogram to refine the cluster definitions, and is more appropriate than cutree for complex dendrograms. Greater control of the empirical clusters can be obtained by manually specifying cutHeight in cutreeDynamic.

library(dynamicTreeCut)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0))

Number of clusters chosen by this method is:

levels(as.factor(my.clusters))
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
cdScFiltAnnot$Clusters <- as.factor(my.clusters)

Drawing the heatmap, first scale te:

library(ComplexHeatmap)
Loading required package: grid
========================================
ComplexHeatmap version 2.0.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
  genomic data. Bioinformatics 2016.
========================================
heat.vals <- chosen.exprs - rowMeans(chosen.exprs)
#clust.col <- rainbow(max(my.clusters))
#heatmap.2(heat.vals, col=bluered, symbreak=TRUE, trace='none', cexRow=0.3,
#    ColSideColors=clust.col[my.clusters], Colv=as.dendrogram(my.tree))
df = data.frame(Class = as.factor(my.clusters))
ha = HeatmapAnnotation(df = df)
#ha = HeatmapAnnotation(df = df, col = list(Condition = c("MC_A" =  "dodgerblue", "MC_B"= "firebrick", "MC_C"="forestgreen", "MC_D"="gold")))
Heatmap(heat.vals , top_annotation = ha, show_column_names=FALSE, cluster_rows = TRUE, clustering_method_columns = "ward.D2", row_names_gp = gpar(fontsize = 8))

Saving the heatmap for later exploration:

pdf("Matthew_Hepworth_heat_newDataset.pdf", width=20, height=40)
Heatmap(heat.vals , top_annotation = ha, show_column_names=FALSE, cluster_rows = TRUE, clustering_method_columns = "ward.D2", row_names_gp = gpar(fontsize = 8))
dev.off()
null device 
          1 

t-SNE with cluster allocation:

Clusters <- as.factor(my.clusters)
p2 <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=Clusters)) +
     geom_point(size=1.0) +
     guides(colour = guide_legend(override.aes = list(size=1))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of cluster Data") +
     theme_classic(base_size=10) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) + scale_color_manual(values=c_clust_col)
p2

#multiplot(p,p2,cols=2)
Clusters <- as.factor(my.clusters)
as.tibble(embedding$layout) %>%
  mutate(Clusters = Clusters) %>%
  ggplot(aes(V1, V2, color=Clusters)) +  geom_point(size=0.75) + theme_classic() + scale_color_manual(values=c_clust_col)

as.tibble(embedding$layout) %>%
  mutate(Samples = colData(cdScFiltAnnot)$Sample) %>%
  ggplot(aes(V1, V2, color=Samples)) +  geom_point(size=0.75) + theme_classic() + scale_color_manual(values=c_clust_col)

How the clustering is impacted with read counts

as.tibble(embedding$layout) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color=ReadDepth)) +  geom_point(size=0.75) + theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))

as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color=ReadDepth)) +  geom_point(size=0.75) + theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))

p1<-as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color=ReadDepth)) +  geom_point(size=0.75) + theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))
p2<-as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(Clusters = cdScFiltAnnot$Clusters) %>%
  ggplot(aes(V1, V2, color=Clusters)) +  geom_point(size=0.75) + theme_classic() + scale_color_manual(values=c_clust_col)
#par(mfrow=c(1,2))
p1 

p2

One of my worry was that the clusters and the t-SNE seperation are dominated by the read depth. Although in t-SNE one can see the gradient of the Read Depth (read depths moving into one side) which has been reported in the literature Hafemeister et. al. 2019 but it surely is not dominating the clusters on the t-SNE seperation.

Cluster validation

It looks like that the clustering method that I prefer to use dynamicTreeCut is producing 12 clusters after removing the cells from 12th cluster from the first run. In this new clustering cluster-8 is spread everywhere and does not seem to be a valid cluster. So, I will not apply clustering validation methods to see this.

Internal measures for cluster validation

In this section, we describe the most widely used clustering validation indices. Recall that the goal of partitioning clustering algorithms (Part @ref(partitioning-clustering)) is to split the data set into clusters of objects, such that:

  • the objects in the same cluster are similar as much as possible,
  • and the objects in different clusters are highly distinct Cluster Validation-1 Cluster Validation-2 Cluster Validation-3

  • That is, we want the average distance within cluster to be as small as possible; and the average distance between clusters to be as large as possible.

Internal validation measures reflect often the compactness, the connectedness and the separation of the cluster partitions.

  1. Compactness or cluster cohesion: Measures how close are the objects within the same cluster. A lower within-cluster variation is an indicator of a good compactness (i.e., a good clustering). The different indices for evaluating the compactness of clusters are base on distance measures such as the cluster-wise within average/median distances between observations.
  2. Separation: Measures how well-separated a cluster is from other clusters. The indices used as separation measures include:
    • distances between cluster centers
    • the pairwise minimum distances between objects in different clusters
  3. Connectivity: corresponds to what extent items are placed in the same cluster as their nearest neighbors in the data space. The connectivity has a value between 0 and infinity and should be minimized.

Generally most of the indices used for internal clustering validation combine compactness and separation measures as follow:

\(Index = \frac{\alpha * Seperation}{\beta * Compactness}\) where \(\alpha\) and \(\beta\) are weights.

Silhouette coefficient

The silhouette analysis measures how well an observation is clustered and it estimates the average distance between clusters. The silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters.

For each observation \(i\), the silhouette width \(s_i\) is calculated as follows:

  1. For each observation \(i\), calculate the average dissimalirty \(\alpha_i\) between \(i\) and all other points of the cluster which \(i\) belongs.
  2. For all other clusters \(C\), to which \(i\) does not belong, calculate the average dissimilarity \(d(i,C)\) of \(i\) to all observations of \(C\). The smallest of these \(d(i,C)\) is defined as \(b_i = min_C d(i,C)\). The value of \(b_i\) can be seen as the dissimilarity between i and its neighbor cluster, i.e. the nearest one to which it does not belong.
  3. Finally the silhouette width of the observation \(i\) is defined by the formula: \(S_i = \frac{(b_i - a_i)}{max(a_i,b_i)}\)

Silhouette width can be interpreted as follow:

  • Observations with a large Si (almost 1) are very well clustered.
  • A small Si (around 0) means that the observation lies between two clusters.
  • Observations with a negative Si are probably placed in the wrong cluster.
library(cluster)
library(factoextra)
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
#my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0))
fviz_silhouette(silhouette(my.clusters, my.dist), print.summary = FALSE)

It looks like that the stability of cluster 7, 8 & 10 is quite low. Something to keep in mind.

Now, how does the TP53 and XIST expression looks like

GeneExp <- logcounts(cdScFiltAnnot)['XIST',]
    #GeneName = 'SPN'
    df <- as.data.frame(tsne_out$Y)
    df[,'GeneExp']=logcounts(cdScFiltAnnot)['XIST',]
p1<-     ggplot(df, aes(x=V1, y=V2, GeneExp = GeneExp)) +
      geom_point(size=1.00,aes(colour = GeneExp), alpha=0.8) +
      scale_colour_gradient(low = "gray88", high = "purple4")+
      #guides(colour = guide_legend(override.aes = list(size=4))) +
      xlab("") + ylab("") +
      ggtitle(paste0('Gene Exp:','XIST'))+
      theme_classic(base_size=14) +
      theme(strip.background = element_blank(),
            strip.text.x     = element_blank(),
            axis.text.x      = element_blank(),
            axis.text.y      = element_blank(),
            axis.ticks       = element_blank(),
            axis.line        = element_blank(),
            panel.border     = element_blank())
p1

GeneExp <- logcounts(cdScFiltAnnot)['TP53',]
    #GeneName = 'SPN'
    df <- as.data.frame(tsne_out$Y)
    df[,'GeneExp']=logcounts(cdScFiltAnnot)['TP53',]
  p2 <-  ggplot(df, aes(x=V1, y=V2, GeneExp = GeneExp)) +
      geom_point(size=1.00,aes(colour = GeneExp), alpha=0.3) +
      scale_colour_gradient(low = "gray88", high = "purple4")+
      #guides(colour = guide_legend(override.aes = list(size=4))) +
      xlab("") + ylab("") +
      ggtitle(paste0('Gene Exp:','TP53'))+
      theme_classic(base_size=14) +
      theme(strip.background = element_blank(),
            strip.text.x     = element_blank(),
            axis.text.x      = element_blank(),
            axis.text.y      = element_blank(),
            axis.ticks       = element_blank(),
            axis.line        = element_blank(),
            panel.border     = element_blank())
  p2

p3 <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=Rep)) +
     geom_point(size=0.75) +
     guides(colour = guide_legend(override.aes = list(size=0.8))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of Expression Data") +
     theme_classic(base_size=10) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) +
          scale_fill_manual(values=c_sample_col) +
         scale_colour_manual(values=c_sample_col)
p3

p4<-as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(Clusters = cdScFiltAnnot$Clusters) %>%
  ggplot(aes(x=V1, y=V2, color=Clusters)) +  
  xlab("") + ylab("") +
  guides(colour = guide_legend(override.aes = list(size=0.8))) +
  geom_point(size=0.75) + 
  theme_classic() + 
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) +
    scale_color_manual(values=c_clust_col)
p4

multiplot(p1,p3,p2,p4,cols=2)

Identifying marker genes

Potential marker genes are identified by taking the top set of DE genes from each pairwise comparison between clusters. We arrange the results into a single output table that allows a marker set to be easily defined for a user-specified size of the top set. For example, to construct a marker set from the top 10 genes of each comparison, one would filter marker.set to retain rows with Top less than or equal to 10.

We save the list of candidate marker genes for further examination. We also examine their expression profiles to verify that the DE signature is robust. The heatmap figure below indicates that most of the top markers have strong and consistent up- or downregulation in cells of cluster 1 compared to some or all of the other clusters. Thus, cells from the subpopulation of interest can be identified as those that express the upregulated markers and do not express the downregulated markers.

library(edgeR)
cluster <- factor(my.clusters)
de.design <- model.matrix(~0 + cluster)
y <- convertTo(cdScFiltAnnot, type="edgeR")
y <- estimateDisp(y, de.design)
fit <- glmFit(y, de.design)
summary(y$tagwise.dispersion)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
  0.0001   0.1426   0.2572   0.6774   0.5837 102.4000 
clust.col <- rainbow(max(my.clusters))

Cluster1 marker genes

result1.logFC <- result1.FDR <- list()
chosen.clust <- which(levels(cluster)=="1") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result1.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result1.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result1.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker1.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result1.logFC), 
    FDR = result1.FDR, stringsAsFactors=FALSE)
marker1.set <- marker1.set[order(marker1.set$Top),]
marker1.set.pos <- marker1.set[rowSums(marker1.set[,3:11]>0)==9,]
marker1.set.pos <- marker1.set.pos[order(marker1.set.pos$Top),]
head(marker1.set, 10)
write.table(marker1.set, file="Louisa_Nelson_10XDataset_Cluster1.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker1.set.pos, file="Louisa_Nelson_10XDataset_Cluster1_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker1.set$Gene[marker1.set$Top <= 10]

Cluster2 marker genes

result2.logFC <- result2.FDR <- list()
chosen.clust <- which(levels(cluster)=="2") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result2.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result2.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result2.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker2.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result2.logFC), 
    FDR = result2.FDR, stringsAsFactors=FALSE)
marker2.set <- marker2.set[order(marker2.set$Top),]
marker2.set.pos <- marker2.set[rowSums(marker2.set[,3:11]>0)==9,]
marker2.set.pos <- marker2.set.pos[order(marker2.set.pos$Top),]
head(marker2.set, 10)
marker2.set.strong.gene <-  marker2.set[rowSums(abs(marker2.set[,3:13])>1)>6,]
head(marker2.set.strong.gene)
write.table(marker2.set.strong.gene, file="Cluster2_HighLo2Fold.txv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker2.set, file="Louisa_Nelson_10XDataset_Cluster2.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker2.set.pos, file="Louisa_Nelson_10XDataset_Cluster2_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker2.set$Gene[marker2.set$Top <= 10]

Cluster3 marker genes

result3.logFC <- result3.FDR <- list()
chosen.clust <- which(levels(cluster)=="3") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result3.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result3.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result3.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker3.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result3.logFC), 
    FDR = result3.FDR, stringsAsFactors=FALSE)
marker3.set <- marker3.set[order(marker3.set$Top),]
marker3.set.pos <- marker3.set[rowSums(marker3.set[,3:11]>0)==9,]
marker3.set.pos <- marker3.set.pos[order(marker3.set.pos$Top),]
head(marker3.set, 10)
write.table(marker3.set, file="Louisa_Nelson_10XDataset_Cluster3.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker3.set.pos, file="Louisa_Nelson_10XDataset_Cluster3_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker3.set$Gene[marker1.set$Top <= 10]

Cluster4 marker genes

result4.logFC <- result4.FDR <- list()
chosen.clust <- which(levels(cluster)=="4") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result4.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result4.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result4.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker4.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result4.logFC), 
    FDR = result4.FDR, stringsAsFactors=FALSE)
marker4.set <- marker4.set[order(marker4.set$Top),]
marker4.set.pos <- marker4.set[rowSums(marker4.set[,3:11]>0)==9,]
marker4.set.pos <- marker4.set.pos[order(marker4.set.pos$Top),]
head(marker4.set, 10)
write.table(marker4.set, file="Louisa_Nelson_10XDataset_Cluster4.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker4.set.pos, file="Louisa_Nelson_10XDataset_Cluster4_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker1.set$Gene[marker4.set$Top <= 10]

Cluster5 marker genes

result5.logFC <- result5.FDR <- list()
chosen.clust <- which(levels(cluster)=="5") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result5.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result5.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result5.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker5.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result5.logFC), 
    FDR = result5.FDR, stringsAsFactors=FALSE)
marker5.set <- marker5.set[order(marker5.set$Top),]
marker5.set.pos <- marker5.set[rowSums(marker5.set[,3:11]>0)==9,]
marker5.set.pos <- marker5.set.pos[order(marker5.set.pos$Top),]
head(marker5.set, 10)
#write.table(marker5.set, file="Louisa_Nelson_10XDataset_Cluster5.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker5.set.pos, file="Louisa_Nelson_10XDataset_Cluster5_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker5.set$Gene[marker5.set$Top <= 10]

Cluster6 marker genes

result6.logFC <- result6.FDR <- list()
chosen.clust <- which(levels(cluster)=="6") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result6.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result6.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result6.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker6.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result6.logFC), 
    FDR = result6.FDR, stringsAsFactors=FALSE)
marker6.set <- marker6.set[order(marker6.set$Top),]
marker6.set.pos <- marker6.set[rowSums(marker6.set[,3:11]>0)==9,]
marker6.set.pos <- marker6.set.pos[order(marker6.set.pos$Top),]
head(marker6.set, 10)
write.table(marker6.set, file="Louisa_Nelson_10XDataset_Cluster6.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker6.set.pos, file="Louisa_Nelson_10XDataset_Cluster6_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker6.set$Gene[marker1.set$Top <= 10]

Cluster7 marker genes

result7.logFC <- result7.FDR <- list()
chosen.clust <- which(levels(cluster)=="7") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result7.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result7.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result7.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker7.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result7.logFC), 
    FDR = result7.FDR, stringsAsFactors=FALSE)
marker7.set <- marker7.set[order(marker7.set$Top),]
marker7.set.pos <- marker7.set[rowSums(marker7.set[,3:11]>0)==9,]
marker7.set.pos <- marker7.set.pos[order(marker7.set.pos$Top),]
head(marker7.set, 10)
write.table(marker7.set, file="Louisa_Nelson_10XDataset_Cluster7.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker7.set.pos, file="Louisa_Nelson_10XDataset_Cluster7_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker7.set$Gene[marker7.set$Top <= 10]

Cluster8 marker genes

result8.logFC <- result8.FDR <- list()
chosen.clust <- which(levels(cluster)=="8") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result8.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result8.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result8.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker8.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result8.logFC), 
    FDR = result8.FDR, stringsAsFactors=FALSE)
marker8.set <- marker8.set[order(marker8.set$Top),]
marker8.set.pos <- marker8.set[rowSums(marker8.set[,3:11]>0)==9,]
marker8.set.pos <- marker8.set.pos[order(marker8.set.pos$Top),]
head(marker8.set, 10)
write.table(marker8.set, file="Louisa_Nelson_10XDataset_Cluster8.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker8.set.pos, file="Louisa_Nelson_10XDataset_Cluster8_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker8.set$Gene[marker8.set$Top <= 10]

Cluster9 marker genes

result9.logFC <- result9.FDR <- list()
chosen.clust <- which(levels(cluster)=="9") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result9.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result9.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result9.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker9.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result9.logFC), 
    FDR = result9.FDR, stringsAsFactors=FALSE)
marker9.set <- marker9.set[order(marker9.set$Top),]
marker9.set.pos <- marker9.set[rowSums(marker9.set[,3:11]>0)==9,]
marker9.set.pos <- marker9.set.pos[order(marker9.set.pos$Top),]
head(marker9.set, 10)
write.table(marker9.set, file="Louisa_Nelson_10XDataset_Cluster9.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker9.set.pos, file="Louisa_Nelson_10XDataset_Cluster9_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker9.set$Gene[marker1.set$Top <= 10]

Cluster10 marker genes

result10.logFC <- result10.FDR <- list()
chosen.clust <- which(levels(cluster)=="10") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result10.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result10.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
collected.ranks <- lapply(result10.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)
marker10.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result10.logFC), 
    FDR = result10.FDR, stringsAsFactors=FALSE)
marker10.set <- marker10.set[order(marker10.set$Top),]
marker10.set.pos <- marker10.set[rowSums(marker10.set[,3:11]>0)==9,]
marker10.set.pos <- marker10.set.pos[order(marker10.set.pos$Top),]
head(marker10.set, 10)
write.table(marker10.set, file="Louisa_Nelson_10XDataset_Cluster10.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker10.set.pos, file="Louisa_Nelson_10XDataset_Cluster10_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker10.set$Gene[marker10.set$Top <= 10]
---
title: "Louisa Nelson scRNA-seq for ovarian cancer model"
output:
  html_notebook:
    theme: united
    toc: yes
    toc_float:
      collapsed: no
      smooth_scroll: no
  html_document:
    df_print: paged
    toc: yes
---
# Project Description

These samples are basically an addition to  the one sample in the paper that was analysed previously. They are cells from patients with ovarian cancer, we isolated the tumour and stromal cells for each patient. And have mixed them back together to give 1 sample per patient.

So the only difference from the previous sample is that these have the tumour and stromals from the same patient mixed together in a `75%:25%` ratio

## Aim
The aims were

    1. To cluster them establish the 2 populations
    2. To establish the over-dispersed genes in the two populations

From the first sample `p53` and `XIST` were differentially expressed so you could use these to validate the initial clustering. The analysis after that would be the same as the previous sample


# Data preperation
## Loading library
```{r}
library(Seurat)
library(scater)
library(ggplot2)
library(scran)
library(GGally)
library(mclust)
library(Rtsne)
library(viridis)
#library(umapr)
library(umap)
library(tidyverse)
library(paletteer)
```

Setting up color palettes. I was using paletter package as it was giving a colour blind palette. But now I am using the palette that Matt gave from Conor. This gives a useful clustering for Matt

Setting up colorblind friendly color palette:

```{r}
#cbPalette <- paletteer_d(package = "ggthemes", palette="calc", n=12)
 c30 <- c("dodgerblue2",#1
         "#E31A1C", #2 red
          "green4", #3
         "#FF7F00", #4 orange
           "green1",#5
         "purple",#6
         "blue1",#7
         "deeppink1",#8
         "darkorange4",#9
          "black",#10
         "gold1",#11
         "darkturquoise",#12
         "#6A3D9A", #13 purple
         "orchid1",#14
         "gray70",#15
          "maroon",#16
         "palegreen2",#17
          "#333333",#18
          "#CAB2D6", #19 lt purple
          "#FDBF6F", #20 lt orange
         "khaki2",#21
         "skyblue2",#22
         "steelblue4",#23
         "green1",#24
         "yellow4",#25
         "yellow3",#26
         "#FB9A99", #27 lt pink
         "brown",#28
         "#000099",#29
         "#CC3300"#30
         )
 
pie(rep(1,30), col=c30)
```
Choosing for the samples

```{r}
c_sample_col <- c30[c(3,22,19,30)]
```

Choosing for the clusters
```{r}
c_clust_col <- c30[c(1,2,3,4,5,6,7,8,11,14,22,24)]
```

__ This data set was not normalized using 10X downsampling as the samples have large variability in their cell numbers.

## Importing cell-ranger data
```{r}
cellranger_pipestance_path <- "/home/ubuntu/Rna-seq_Data-Analysis/Louisa_Nelson_10X_Analysis/LN_aggr/outs/filtered_feature_bc_matrix/"
ovarianCancer <- Read10X(cellranger_pipestance_path)
#analysis_results <- load_cellranger_analysis_results(cellranger_pipestance_path)
```


```{r}
ovarianCancer
```


## Extracting cell information
```{r}
Sample1_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("1", colnames(ovarianCancer))], Sample="Patient1")
Sample2_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("2", colnames(ovarianCancer))], Sample="Patient2")
Sample3_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("3", colnames(ovarianCancer))], Sample="Patient3")
Sample4_barcodes  <- data.frame(barcode=colnames(ovarianCancer)[grep("4", colnames(ovarianCancer))], Sample="Patient4")

print(paste0('Number of Sample1 cells: ',dim(Sample1_barcodes)[1]))
print(paste0('Number of Sample2 cells: ',dim(Sample2_barcodes)[1]))
print(paste0('Number of Sample3 cells: ',dim(Sample3_barcodes)[1]))
print(paste0('Number of Sample4 cells: ',dim(Sample4_barcodes)[1]))
```


__Confirmed this number with the QC output produced by cell ranger.__


```{r}
annotBarcode <- rbind(Sample1_barcodes, Sample2_barcodes, Sample3_barcodes, Sample4_barcodes)
rownames(annotBarcode) <- annotBarcode$barcode
head(annotBarcode)
```


Creating the `SingleCellExperiment` object using `scater`. We would be using the `exprs` function of `gbm`. Later on scater might add its own function to input 10X data, but before that we will be using this method
```{r}
cdSc <- SingleCellExperiment(assays = list(counts = as.matrix(ovarianCancer)))
colData(cdSc)$barcode <- annotBarcode$barcode
colData(cdSc)$Sample <- annotBarcode$Sample
rowData(cdSc)$symbol <- rownames(ovarianCancer)
cdSc <- scater::calculateQCMetrics(cdSc)
```
```{r}
cdSc
```

Here we use log2-counts-per-million with an offset of 1 as the exprs values. We have to note that the CPM for scater is different than the regular CPM calculation as we are considering the step-size here. 

The value of the log-CPMs is explained by adding a prior count to avoid undefined values after the log-transformation, multiplying by a million, and dividing by the mean library size. This size factors are used to define the effective library sizes. This is done by scaling all size factors such that the mean scaled size factor is equal to the mean sum of counts across all features. The effective library sizes are then used to in the denominator of the CPM calculation. The way that `scater` calculates the log-normalized counts are

```
lib.sizes <- colSums(counts(example_sceset))
lib.sizes <- lib.sizes/mean(lib.sizes)
log2(counts(example_sceset)[1,]/lib.sizes+1)
```
We now normalize for all the counts
```{r}
exprs(cdSc) <- log2(scater::calculateCPM(cdSc, use_size_factors = TRUE) + 1)
```

Saving Dataset
```{r}
save.image()
# save.image('PC_20181002.RData')
```

# QC analysis
At the start of the QC analysis we make different plots to visualize the summary statistics.

## Cell QC
Low-quality cells need to be identified and removed to ensure that the technical effects do not distort downstream analysis results. Two common measures of cell quality are the __library size__ and the __number of expressed genes__ in each library. The library size is defined as the total sum of counts across all genes. Cells with relatively small library sizes are considered to be of low quality as the RNA has not been efficiently captured (i.e., converted into cDNA and amplified) during library preparation. The number of expressed genes in each cell is defined as the number of genes with non-zero counts for that cell. Any cell with very few expressed genes is likely to be of poor quality as the diverse transcript population has not been successfully captured. 


### Reads mapping with mitochondrial reads
We identify the rows corresponding to mitochondrial genes. Another important measure of quality is the proportion of reads mapped to genes in the mitochondrial genome. High proportions are indicative of poor-quality cells ([Ilicic et al., 2016](https://f1000research.com/articles/5-2122/v2#ref-14); [Islam et al., 2014](https://f1000research.com/articles/5-2122/v2#ref-16)), possibly because of increased apoptosis and/or loss of cytoplasmic RNA from lysed cells. 

Reads mapping with mitochondrial reads

We identify the rows corresponding to mitochondrial genes.

```{r}
library(biomaRt)
my.ids <- gsub('\\..*','',rownames(cdSc))
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
chrName <- getBM(attributes=c('ensembl_gene_id','hgnc_symbol','chromosome_name'), filters = 'hgnc_symbol', values =my.ids, mart = ensembl)
```

```{r}
chrName <- chrName[match(my.ids, chrName$hgnc_symbol),]
is.mito <- chrName$chromosome_name == "MT" & !is.na(chrName$chromosome_name)
sum(is.mito)
```
```{r}
cdSc <- calculateQCMetrics(cdSc, feature_controls=list(Mt=is.mito))
```

Plotting the histograms of my QC entries.
```{r fitQC_Mt, fig.height=8, fig.width=10}
par(mfrow=c(2,2))
hist((cdSc$total_counts)/1e6, xlab="Library sizes (millions)", main="Histogram of Library size",
     breaks=100, col="grey80", ylab="Number of cells")

hist(cdSc$total_features_by_counts, xlab="Number of expressed genes", main="Histogram of No. of Features",
     breaks=100, col="grey80", ylab="Number of cells")

hist(cdSc$pct_counts_Mt, xlab="Mitochondrial proportion (%)",
     ylab="Number of cells", breaks=100, main="Histogram of Mictochondria %", col="grey80")
```

The summary statistics for mitochondrial read proportion is reasonably good. As anything below 10% is very good and here the mean is around 7%. However, there are some outlier cells expressing very high mitocondrial genes. I am going to filter those genes out.

```{r}
summary(cdSc$pct_counts_Mt)
```


It is also valuable to examine how the QC metrics behave with respect to each other. Generally, they will be in rough agreement, i.e., cells with low total counts will also have low numbers of expressed features and high mitochondrial proportions.

```{r}
par(mfrow=c(1,3))
plot(cdSc$total_features_by_counts, cdSc$total_counts/1e6, xlab="Number of expressed genes",
    ylab="Library size (millions)")
plot(cdSc$total_features_by_counts, cdSc$pct_counts_Mt, xlab="Number of expressed genes",
    ylab="Mitochondrial proportion (%)")
plot(cdSc$total_counts/1e6, cdSc$pct_counts_Mt, xlab="Total counts (in millions)",
    ylab="Mitochondrial proportion (%)")

```


Also how the counts are distributed across the cells..
```{r}
plotRowData(cdSc, x = "n_cells_by_counts", y = "log10_total_counts", colour_by = "is_feature_control_Mt")
```
In the figure above we see that some of the mitochondrial genes (marked in orange) at the top right corner of the figure has very high log10_total_counts and also are expressed in a large number of cells (almost in all cells). This is not very surprising as some of the mitochondrial genes are indeed expressed across all the cells as they are producing energy. The worrying part would be if these genes takes majority of a cell's read (more than 80%). Then I would remove those cells.

```{r}
plotRowData(cdSc, x = "n_cells_by_counts", y = "log10_total_counts", colour_by = "pct_dropout_by_counts")
```

As expected, cells with low number of reads have higher dropout counts.

```{r}
plotColData(cdSc, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")
```
Again cells with lower read counts and lower total feature counts have very high proportion of mitchondrial reads, clearly indicating that these cells either started apoptosis or broken abruptly. We can safely remove these cells in our QC filtering. 

To see the QC metrics related with cells, 

```{r}
names(colData(cdSc))
```


and to see the QC metrics related with genes,
```{r}
names(rowData(cdSc))
```


```{r}
plotColData(cdSc, x = "log10_total_features_by_counts", y="log10_total_counts", size_by ="total_counts_Mt", colour_by  = "pct_counts_Mt")
```
All the above figures clearly show that the high proportions of reads mapped to genes in the mitochondrial genome is indicative of poor quality cells ([Ilicic et al., 2016](https://f1000research.com/articles/5-2122/v2#ref-14); [Islam et al., 2014](https://f1000research.com/articles/5-2122/v2#ref-16)), possibly because of increased apoptosis and/or loss of cytoplasmic RNA from lysed cells. I would be filtering those cells out.  

The following figure gives an idea of how many genes are expressed in how many cells.

```{r}
plotExprsFreqVsMean(cdSc)
```
The genes looks good as they are not yet filtered.


Now, picking up a threshold for different metrices in cell filtering is not straightforward as their absolute values depend on the protocol and biological system. For example, sequencing to greater depth will lead to more reads, regardless of the quality of the cells. To obtain an adaptive threshold, we assume that most of the dataset consists of high-quality cells. To facilitate in picking a threshold for our cell cutoff we now plot couple of densitiies.
```{r}
cut_off_reads <- median(cdSc$log10_total_counts) - 3*mad(cdSc$log10_total_counts)
df <- data.frame(x=cdSc$log10_total_counts, Sample = cdSc$Sample)
plot_reads <- ggplot(df,
       aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_reads, colour="grey", linetype = "longdash") +
       labs(x = expression('log'[10]*'(Library Size)'), title = "Total reads density", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col)


cut_off_mRNA <- median(cdSc$log10_total_features_by_counts) - 3*mad(cdSc$log10_total_features_by_counts)
df <- data.frame(x=cdSc$log10_total_features_by_counts, Sample = cdSc$Sample)
plot_mRNA <- ggplot(df,
       aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_mRNA, colour="grey", linetype = "longdash") +
       labs(x = expression('log'[10]*'(Number of expressed genes)'), title = "Total genes expressed", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col)


cut_off_MT <- median(cdSc$pct_counts_Mt) + 3*mad(cdSc$pct_counts_Mt)
df <- data.frame(x=cdSc$pct_counts_Mt, Sample = cdSc$Sample)
plot_MT <- ggplot(df,
       aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_MT, colour="grey", linetype = "longdash") +
       labs(x = expression('Pct of MT Expression'), title = "Mitochondrial Expression", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col)

```

```{r figDist, fig.height = 8, fig.width = 9}
multiplot(plot_reads, plot_MT, plot_mRNA,  cols=2)
```

```{r figIndDist, fig.height = 8, fig.width = 9}
df <- data.frame(x=cdSc$log10_total_counts)
as_tibble(df) %>%
  dplyr::mutate("Sample" = cdSc$Sample) %>%
  ggplot( aes(x = x, fill = as.factor(Sample))) + 
       geom_density(alpha = 0.5) +
       geom_vline(xintercept = cut_off_reads, colour="grey", linetype = "longdash") +
       labs(x = expression('log'[10]*'(Library Size)'), title = "Total reads density", fill = "Sample") + 
       theme_classic(base_size = 14)+
       scale_fill_manual(values=c_sample_col) + facet_wrap(~Sample, nrow=2,ncol=2)
```


The distribution of the reads, number of expressed genes and the percent of MT expression. The shaded line represents the threshold which is the 3 Median Absolute Deviation (MAD). For the top two figures, cells on the left of this threshold would be filtered out and for the bottom figure, cells on the right of this figure would be filtered out.

We see that in the library size plot and also the plot with number of expressed features, there are a biomodal distribution. In the previous cellranger versions we generally do not observe this. But with new cellranger it can capture cells with lower RNA contents. Due to this we are now getting cells with lower read counts. 

We remove cells with log-library sizes that are more than 4 median absolute deviations (MADs) below the median log-library size. (A log-transformation improves resolution at small values, especially when the MAD of the raw values is comparable to or greater than the median). We also remove cells where the log-transformed number of expressed genes is 4 MADs below the median. Similarly, we drop the cells having pct of MT expression below 3 MAD.

Although from the cellranger output it looked that the cells have very different total counts, but after removing the extreme outlier cells it looks like they have quite nice packed distribution.

However, in the G samples higher number of cells have higher read counts as can be seen from the individual density. But that does not impact the cutoff value. 
Below we check the cutoff values to validate our filtering.

```{r}
print(paste0('Read count Cutoff: ',10^cut_off_reads))
print(paste0('Genes count Cutoff: ',10^cut_off_mRNA))
print(paste0('MT percent count Cutoff: ',cut_off_MT))
```



We now calculate the cells and genes that are deemed to be outliers due to falling below 3 MAD.

Cells having read counts below 1289 (ByLibSize) and number of genes expressed below 864 (ByFeature) would be dropped and if cells are having more than 14% of mitochondrial reads, I would drop the cells.

```{r}
libsize.drop <- isOutlier(cdSc$total_counts, nmads=3, type="lower", log=TRUE)
feature.drop <- isOutlier(cdSc$total_features_by_counts, nmads=3, type="lower", log=TRUE)
mito.drop <- isOutlier(cdSc$pct_counts_Mt, nmads=3, type="higher")
```

Below are the stats for the cells that would be dropped
```{r}
data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop), 
           ByMito=sum(mito.drop))
```

So, based on library size, 825 cells would be dropped and based on number of features expressed, 1210 cells would be dropped and for Mitochondrial proportion of reads 517 cells would be dropped. Please note that all these cells are removed from the total population and also many of these cells would be overlapping across conditions.


Generating, the `scater` object with filtered profile.
```{r}
cdScFilt <- cdSc[,!(libsize.drop | feature.drop | mito.drop)]
cdScFilt <- calculateQCMetrics(cdScFilt)
```

Before filtering:
```{r}
cdSc
```

After filtering:
```{r}
cdScFilt
```


Before filtering number of cells were,
```{r}
print(paste0("Cells before filtering: ",dim(cdSc)[2]))
```



After filtering remaining cells are:
```{r}
print(paste0("Cells remaining after filtering: ",dim(cdScFilt)[2]))
```

For the four samples, number of cells that are remaining are:
```{r}
print(levels(as.factor(colData(cdSc)$Sample)))
print(paste0('Before Filtering: ', table(colData(cdSc)$Sample)))
print(paste0('After Filtering: ', table(colData(cdScFilt)$Sample)))
```

So, the cells that are being filtered out are kind of evenly distributed across the samples. This is kind of reassuring that there was not a point failure where one sample totally failed. After filtering we have in total 21609 cells remaining for downstream analysis which is a very reasonable number.

Now, I will again run couple of QCs to see whether further filtering is required.

Relationship between total counts and total features before filtering:
```{r}
p <- plotColData(cdSc, x="log10_total_counts",y="total_features_by_counts", colour_by = "Sample") +
       scale_fill_manual(values=c_sample_col)
p
```

After filtering:
```{r}
p <- p <- plotColData(cdScFilt, x="log10_total_counts",y="total_features_by_counts", colour_by = "Sample") +
       scale_fill_manual(values=c_sample_col)
p
```
We could see that some of the outlier cells have been dropped.

Before filtering:
```{r}
plotColData(cdSc, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")
```

After filtering:
```{r}
plotColData(cdScFilt, x = "log10_total_counts", y = "log10_total_features_by_counts", colour_by = "pct_counts_Mt")
```
We see that the good cells are kept while cells with very high Mt reads are being removed.

Now to see about the higher read depth cells I plot violin plots. This would give me an indication as whether there is any obvious outliers. I will be plotting the cells for `TotalCounts` and `TotalFeaturesMitochondria`.

```{r figViolin, fig.height=9, fig.width = 9}
df <- data.frame(Cell=colnames(cdScFilt), CellType=cdScFilt$Sample, totalFeatures=cdScFilt$total_features_by_counts, totalCount=cdScFilt$total_counts, PctTotalCountMt=cdScFilt$pct_counts_Mt, Sample=cdScFilt$Sample)
p1 <- ggplot(df, aes(factor(CellType),totalCount,colour=Sample))
p1 <- p1 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))

p2 <- ggplot(df, aes(factor(CellType),PctTotalCountMt,colour=Sample))
p2 <- p2 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))

p3 <- ggplot(df, aes(factor(CellType),totalFeatures,colour=Sample))
p3 <- p3 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))


multiplot(p1,p2,p3,cols = 2)
```
It clearly looks like that two cells above total counts of 30000 are outliers. This again would be removed as it would otherwise might be multiplets

```{r}
cdScFilt
```
```{r}
cdScFilt <- cdScFilt[,!(cdScFilt$total_counts > 30000)]
```

```{r}
cdScFilt
```

Drawing the vuiolin plot again

```{r figViolin_v2, fig.height=9, fig.width = 9}
df <- data.frame(Cell=colnames(cdScFilt), CellType=cdScFilt$Sample, totalFeatures=cdScFilt$total_features_by_counts, totalCount=cdScFilt$total_counts, PctTotalCountMt=cdScFilt$pct_counts_Mt, Sample=cdScFilt$Sample)
p1 <- ggplot(df, aes(factor(CellType),totalCount,colour=Sample))
p1 <- p1 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))

p2 <- ggplot(df, aes(factor(CellType),PctTotalCountMt,colour=Sample))
p2 <- p2 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))

p3 <- ggplot(df, aes(factor(CellType),totalFeatures,colour=Sample))
p3 <- p3 + geom_violin() + geom_jitter(height = 0, width = 0.1) + theme_classic(base_size = 14) + scale_colour_manual(values=c_sample_col) +
  theme(axis.text.x=element_text(angle=90, hjust=1))


multiplot(p1,p2,p3,cols = 2)
```
Looks a good distribution to move forward.

```{r}
save.image()
```



# Classification of cell cycle phase

We use the prediction method described by [Scialdone et al. (2015)](http://www.sciencedirect.com/science/article/pii/S1046202315300098) to classify cells into cell cycle phases based on the gene expression data. Using a training dataset, the sign of the difference in expression between two genes was computed for each pair of genes. Pairs with changes in the sign across cell cycle phases were chosen as markers. Cells in a test dataset can then be classified into the appropriate phase, based on whether the observed sign for each marker pair is consistent with one phase or another. We do the cell cycle classification before gene filtering as this provides more precise cell cycle phase classifications. This approach is implemented in the Cyclone function using a pre-trained set of marker pairs for human data. Some additional work is necessary to match the gene symbols in the data to the Ensembl annotation in the pre-trained marker set.

```{r}
cdScFiltAnnot <- cdScFilt
```

```{r}
hg.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
library(org.Hs.eg.db)
anno <- select(org.Hs.eg.db, keys=as.character(rownames(cdScFiltAnnot)), keytype="SYMBOL", column="ENSEMBL")
ensembl <- anno$ENSEMBL[match(as.character(rownames(cdScFiltAnnot)), anno$SYMBOL)]
assignments <- cyclone(cdScFiltAnnot, hg.pairs, gene.names=ensembl)
```

```{r}
df <- data.frame(x=assignments$score$G1, y=assignments$score$G2M, Sample=colData(cdScFilt)$Sample)
 p<-ggplot(data=df, aes(x=x,y=y,color=Sample)) + 
    geom_point(size=0.5)+
    xlab("G1 score")+
    ylab("G2M score")+
    ylim(0,1)+
    xlim(0,1)+
    ggtitle("Cell-cycle effects")+
    theme_light(base_size=15)+
    theme(axis.title.x = element_text(size=10, vjust=-2),
          axis.text.x  = element_text( size=10),
          axis.title.y = element_text( size=10,vjust=2),
          axis.text.y  = element_text( size=10)) +
    theme(plot.margin=unit(c(1,1,1.5,1.2),"cm"))+
    theme(legend.text=element_text(size=10),#size of legend
          legend.title=element_text(size=10), 
          plot.title = element_text(size=20, face="bold")) +
    scale_colour_manual(values=c_sample_col) + 
    #scale_color_discrete(name=legend.title)+
    geom_segment(aes(x = 1/2, y = 0, xend=1/2, yend=1/2),colour="black") + 
    geom_segment(aes(x = 0, y = 1/2, xend=1/2, yend=1/2),colour="black") +
    geom_segment(aes(x = 1/2, y = 1/2, xend=1, yend=1),colour="black") +
    annotate("text", x=0.05, y=0.05, label="S", size=8)+
    annotate("text", x=0.95, y=0.25, label="G1", size=8)+
    annotate("text", x=0.25, y=0.95, label="G2M", size=8)
print(p)
smoothScatter(assignments$score$G1, assignments$score$G2M, xlab="G1 score", ylab="G2/M score", pch=16, cex=0.6)
```

Cells are classified as being in G1 phase if the G1 score is above 0.5 and greater than the G2/M score; in G2/M phase if the G2/M score is above 0.5 and greater than the G1 score; and in S phase if neither score is above 0.5. Here, the vast majority of cells are classified as being in G1 phase. 

It looks like majority of the cells are in high G2M or S score indicating that the cells are __INDEED__ going through cell-cycle stages. This is we would also expect from Cancer cells. 

This method would be less accurate for data that are substantially different from those used in the training set, e.g., due to the use of a different protocol. This dataset uses UMI counts, which has an entirely different set of biases, e.g., 3’-end coverage only, no length bias, no amplification noise. These new biases (and the absence of expected biases) may interfere with accurate classification of some cells. So we are not particularly sure about this result. Nevertheless we need to keep in mind that there could be quite high cell-cycle effect which might confound the dataset. To avoid problems from misclassification, I will not perform any processing of this dataset by cell cycle phase. This is unlikely to be problematic for this analysis, as the cell cycle effect will be relatively subtle compared to the obvious differences between cell types in a diverse population. Thus, the former is unlikely to distort the conclusions regarding the latter.

```{r}
colData(cdScFilt)$CellCycle <- assignments$phases
colData(cdScFiltAnnot)$CellCycle <- assignments$phases
plotPCA(cdScFiltAnnot, colour_by = "CellCycle", shape_by="Sample")
```

It does not look like one patient is dominated by a cell-cycle phase. 


# Filtering out low-abundance genes
Low-abundance genes are problematic as zero or near-zero counts do not contain enough information for reliable statistical inference ([Bourgon et al., 2010](http://www.pnas.org/content/107/21/9546)). In addition, the discreteness of the counts may interfere with downstream statistical procedures, e.g., by compromising the accuracy of continuous approximations. 

Generally for low-abundance genes are defined as those with an average count below a filter threshold of 1. But 10X Chromium is based on UMI counts, which would have understandably low counts, so setting the threshold to 1 would filter a large number of cells. Here I am going to set it to 0.01. The figure below will explain the reason.

These genes are likely to be dominated by drop-out events ([Brennecke et al., 2013](https://www.nature.com/articles/nmeth.2645)), which limits their usefulness in later analyses. Removal of these genes mitigates discreteness and reduces the amount of computational work without major loss of information.

```{r}
ave.counts <- rowMeans(counts(cdScFilt))
keep <- ave.counts >= 0.01
sum(keep)
```
After filtering with average count of 0.01 there are 13210 genes left for the downsteram analysis. 

To check whether the chosen threshold is suitable, we examine the distribution of log-means across all genes (Figure below). Generally for higher number of cells there is a peak on the right hand side that represents the bulk of moderately expressed genes while in the middle there is a rectangular component that corresponds to lowly expressed genes. The filter threshold should cut the distribution at some point along the rectangular component to remove the majority of low-abundance genes. As the blue line repsresents in the figure below, it cuts the counts at the rectangular component.

```{r}
hist(log10(ave.counts), breaks=100, main="", col="grey80",
     xlab=expression(Log[10]~"average count"))
abline(v=log10(0.01), col="blue", lwd=2, lty=2)
```

We will look at the identities of the most highly expressed genes before filtering them. This should generally be dominated by constitutively expressed transcripts, such as those for ribosomal or mitochondrial proteins. The presence of other classes of features may be cause for concern if they are not consistent with expected biology. For example, the absence of ribosomal proteins and/or the presence of their pseudogenes are indicative of suboptimal alignment.

```{r}
plotHighestExprs(cdScFiltAnnot)
```

```{r}
pdf('Highest_expression_50Gene.pdf')
plotHighestExprs(cdScFiltAnnot)
dev.off()
```

Percentage of total counts assigned to the top 50 most highly-abundant features in the dataset.

For each feature, each bar represents the percentage assigned to that feature for a single cell, while the circle represents the average across all cells. Bars are coloured by the total number of expressed features in each cell, while circles are coloured according to whether the feature is labelled as a control feature.

__Do the topmost genes appear to agree with the biology?__

__Louisa: Please look at the genes whether those make sense....___

Now for the filteirng, I generally prefer the mean-based filter as it tends to be less aggressive. A gene will be retained as long as it has sufficient expression in any subset of cells. Genes expressed in fewer cells require higher levels of expression in those cells to be retained, but this is not undesirable as it avoids selecting uninformative genes (with low expression in few cells) that contribute little to downstream analyses, e.g., HVG detection or clustering. In contrast, the “at least n” filter depends heavily on the choice of n. With n = 10, a gene expressed in a subset of 9 cells would be filtered out, regardless of the level of expression in those cells. This may result in the failure to detect rare subpopulations that are present at frequencies below n. While the mean-based filter will retain more outlier-driven genes, this can be handled by choosing methods that are robust to outliers in the downstream analyses.

Thus, we apply the mean-based filter to the data by subsetting the SCESet object as shown below. This removes all rows corresponding to endogenous genes or spike-in transcripts with abundances below the specified threshold.

```{r}
cdScFilt <- cdScFilt[keep,]
cdScFiltAnnot <- cdScFilt
#cdScFiltAnnot <- calculateQCMetrics(cdScFiltAnnot)
```
```{r}
cdScFiltAnnot
```

The filtering now lets see how the total counts and features plots:
```{r}
p <- plotColData(cdScFilt, x="log10_total_counts",y="total_features_by_counts", colour_by = "Sample")  + scale_fill_manual(values=c_sample_col) 
p
```

What is the frequency of the cells with mean expression?:
```{r}
plotExprsFreqVsMean(cdScFiltAnnot)
```
Overall looks a reasonable number of genes expressed in 50% of cells. 

```{r figQCFeatures, fig.height=8, fig.width=9}
p <- plotColData(cdScFilt, x="total_counts",y="total_features_by_counts", colour_by = "Sample") + scale_colour_manual(values=c_sample_col) 

p2 <- plotRowData(cdScFilt, x = "log10_total_counts", y = "n_cells_by_counts", colour_by = "pct_dropout_by_counts")

multiplot(plotPCA(cdScFilt, colour_by="Sample",
     size_by="total_features_by_counts") + scale_fill_manual(values=c_sample_col) ,
          p,
          p2,
          cols=2)
```


```{r}
summary(rowData(cdScFilt)$pct_dropout_by_counts)
```

Some of the genes in the top right figure still has high drop-outs. This is not unusual for a single-cell RNA-seq as the data is very sparse and specially when using UMI based counts. If there are genes that have very high read counts, but are then expressed only very few cells, that would be worrying as it would indicate that those cells are behaving abruptly.

__So things look fine in QC.__


## Normalization of cell-specific biases
Now, we would be doing step size based normalization. One of the widely used step-wise normalization technique for Bulk data is the one used by DESeq2. However, as the single cell data is very sparse we cannot use this method. The other very efficient method is the deconvolution based method which we would be using here.

__Using the deconvolution method to deal with zero counts:__ 
Read counts are subject to differences in capture efficiency and sequencing depth between cells ([Stegle et al., 2015](https://www.nature.com/articles/nrg3833)). Normalization is required to eliminate these cell-specific biases prior to downstream quantitative analyses. This is often done by assuming that most genes are not differentially expressed (DE) between cells. Any systematic difference in count size across the non-DE majority of genes between two cells is assumed to represent bias and is removed by scaling. More specifically, “size factors” are calculated that represent the extent to which counts should be scaled in each library.

Single-cell data can be problematic due to the dominance of low and zero counts. To overcome this, we pool counts from many cells to increase the count size for accurate size factor estimation ([Lun et al., 2016](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0947-7)). Pool-based size factors are then “deconvolved” into cell-based factors for cell-specific normalization.
```{r}
cdScFiltAnnot <- computeSumFactors(cdScFiltAnnot, sizes=c(100, 200, 300, 400, 440, 480, 500, 550))
summary(sizeFactors(cdScFiltAnnot))
```

If the size factors are tightly correlated with the library sizes for all cells this would suggests that the systematic differences between cells are primarily driven by differences in capture efficiency or sequencing depth. Any DE between cells would yield a non-linear trend between the total count and size factor, and/or increased scatter around the trend.

```{r}
DF <- data.frame(VAR1=sizeFactors(cdScFiltAnnot), VAR2=cdScFiltAnnot$total_counts/1e6)
model = lm(VAR2 ~ VAR1, DF)
summary(model)
```
```{r}
sp = ggplot(DF, aes(x=VAR1, y=VAR2))
sp + geom_point() + stat_smooth(method=lm) + theme_light(base_size=15) +
        theme(strip.background = element_blank(),
              panel.border     = element_blank(),
              plot.title = element_text(hjust = 0.5)) +  
              xlab("Size Factor") + 
              ylab("Library Size (millions)")+
              annotate("text", label="R^2=0.8762", x=2.8, y=0.075)
```
With a high R^2 value, the size factors are very tightly correlated with the library size. I calculate the size factor normization and put it in a slot of `SummarizedExperiment` object.


__Applying the size factors to normalize gene expression:__

The count data are used to compute normalized log-expression values for use in downstream analyses. Each value is defined as the log-ratio of each count to the size factor for the corresponding cell, after adding a prior count of 1 to avoid undefined values at zero counts. Division by the size factor ensures that any cell-specific biases are removed. If spike-in-specific size factors are present in sce, they will be automatically applied to normalize the spike-in transcripts separately from the endogenous genes.
```{r}
#cdScFiltAnnot<- normalize(cdScFiltAnnot)
norm_exprs(cdScFiltAnnot)<- scater::normalize(cdScFiltAnnot)
```

## Re-calculate the CPM normalization

I recalculate the `counts-per-million` for these cells. This is because large number of genes have been filtered out which would have impacted the library size when CPM was calcualted with unfiltered data. Now, as the number of genes have been reduced, so is the library size. 
```{r}
# cpm(cdScFiltAnnot) <- log2(calculateCPM(cdScFiltAnnot, use_size_factors = TRUE) + 1)
exprs(cdScFiltAnnot) <- log2(calculateCPM(cdScFiltAnnot, use_size_factors = TRUE) + 1)
```

I set the parameter `use.size.factors = TRUE`. This would construct the effective library size instead of the sum of counts as the library size. This effective library size is defined based on the step size calcualted as described in the pool based method paper.

The log-transformation provides some measure of variance stabilization ([Law et al., 2014](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4053721/)), so that high-abundance genes with large variances do not dominate downstream analyses. The computed values are stored as an exprs matrix in addition to the other assay elements.


## Checking for important technical factors

We check whether there are technical factors that contribute substantially to the heterogeneity of gene expression. If so, the factor may need to be regressed out to ensure that it does not inflate the variances or introduce spurious correlations. For this dataset, the simple experimental design means that there are no plate or batch effects to examine. However there could be other technical factors like the cell-cycle effect or dropouts
```{r}
plotExplanatoryVariables(cdScFiltAnnot, variables=c("total_features_by_counts", "total_counts", "CellCycle", "pct_counts_Mt", "Sample"))
```
Among the important technical factors, we can see that number of total features (genes) and total counts are explaining the major variability in the dataset. This confirms that cells would be mostly varied based on number of genes that they are expressing. In the cell-cycle plot showed earlier, we showed that there is cell-cycle effect but it might not be confounding the analysis. 

Number of total features explaining majority of the variability is a common issue in single-cell RNA-seq. This could be due to the sparsity of the data. We need to keep in mind that cells in the PCA plot might be seperating just because they have very different number of features expressed in total.

However, I am expecting this would not confound the t-SNE plot that would take the non-linear relationship between the variables.


# Identifying HVGs from the normalized log-expression

This is one of the very important excercies.

I now identify HVGs to focus on the genes that are driving heterogeneity across the population of cells. This requires estimation of the variance in expression for each gene, followed by decomposition of the variance into biological and technical components. HVGs are then identified as those genes with the largest biological components. This avoids prioritizing genes that are highly variable due to technical factors, such as sampling noise during RNA capture and library preparation.

In the recent implementation of `seurat`, Rahul Satija took a slightly different approach for HVG calculation. They calculate the variance and mean for each gene in the dataset (storing this in `object@hvg.info`), and sorts genes by their variance/mean ratio (VMR). They have observed that for large-cell datasets with unique molecular identifiers, selecting highly variable genes (HVG) simply based on VMR is an efficient and robust strategy. 

But in our implementation we fit the trend to the variance estimates of the endogenous genes, using the `use.spikes=FALSE` setting as shown below. This assumes that the majority of genes are not variably expressed, such that the technical component dominates the total variance for those genes. The fitted value of the trend is then used as an estimate of the technical component. 

__span:__ Low-abundance genes with mean log-expression below `min.mean` are not used in trend fitting, to preserve the sensitivity of span-based smoothers at moderate-to-high abundances. It also protects against discreteness, which can interfere with estimation of the variability of the variance estimates and accurate scaling of the trend. The default threshold is chosen based on the point at which discreteness is observed in variance estimates from Poisson-distributed counts. For heterogeneous droplet data, a lower threshold of 0.001-0.01 should be used.

```{r}
var.fit <- trendVar(cdScFiltAnnot, method="loess", use.spikes=FALSE, min.mean=1.0)
var.out <- decomposeVar(cdScFiltAnnot, var.fit)
```

We assess the suitability of the trend fitted to the endogenous variances by examining whether it is consistent with the variances. The trend passes through or close to most of the endogenous gene variances, indicating that our assumption (that most genes have low levels of biological variability) is valid. This strategy exploits the large number of endogenous genes to obtain a stable trend. 

```{r}
plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", ylab="Variance of log-expression")
o <- order(var.out$mean)
lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2)
```
Ideally the plot above would look like the mountain, where it would have rise in the middle but then would be dropping at the end as we have low variance for highly expressed genes. The trend line would follow it as well. 
<!--However, for this dataset the mountain shape is not obvious and this could be due to use of UMI counts.-->

HVGs are defined as genes with biological components that are significantly greater than zero at a false discovery rate (FDR) of 5%. These genes are interesting as they drive differences in the expression profiles between cells, and should be prioritized for further investigation. In addition, I only consider a gene to be a HVG if it has a biological component greater than or equal to 0.5. For transformed expression values on the log2 scale, this means that the average difference in true expression between any two cells will be at least 2-fold. (This reasoning assumes that the true log-expression values are Normally distributed with variance of 0.5. The root-mean-square of the difference between two values is treated as the average log2-fold change between cells and is equal to unity.) I rank the results by the biological component to focus on genes with larger biological variability.

```{r}
hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$bio >= 0.5),]
hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),]
nrow(hvg.out)
```
So, there are 632 HVGs for this dataset.

Plotting the HVGs. The red dots are the HVGs we selected. 

```{r}
plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
     ylab="Variance of log-expression")
o <- order(var.out$mean)
lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2)
points(var.out[rownames(hvg.out),]$mean, var.out[rownames(hvg.out),]$total, col="red", pch=16, cex=0.6)
```

Plotting the top 15 HVGs:
```{r figExp, fig.width=9}
plotExpression(cdScFiltAnnot, rownames(hvg.out)[1:15])
```

__Discuss meaning of genes.__


```{r}
write.table(file="HVG_Louisa_Nelson_10X_Dataset.tsv", hvg.out, sep="\t", quote = FALSE, col.names = NA)
head(hvg.out, 20)
```
There are other alternative approaches for determining the HVGs specially those based on Coefficient of Variance. Here I use the variance of the log-expression values because the log-transformation protects against genes with strong expression in only one or two cells. This ensures that the set of top HVGs is not dominated by genes with (mostly uninteresting) outlier expression patterns.

However, I would like to also test other methods of identifying the HVGs. It has been mentioned that fitting the trendline to endogenous genes might not always be a good idea.


## Downstream analysis with HVG genes
I will see instead of taking the correlated genes how the result comes up with the HVGs only.

```{r multiPCAHVG, fig.height=10, fig.width=12}
rowVarsSorted <- exprs(cdScFiltAnnot)[rownames(hvg.out),]
FinalPCAData <- t(rowVarsSorted)
pcaPRComp <- prcomp(FinalPCAData)
nmax = 10
if(dim(pcaPRComp$x)[1] < 10){ nmax = dim(pcaPRComp$x)[1] }
txt1 <- paste("Percent_PC_Var_onfirst",nmax,"PCs",sep="")
pca_var = pcaPRComp$sdev ^ 2
pca_var_percent <- 100 * pca_var / sum(pca_var)
pca_var_percent_first10 <- NA * pca_var
pca_var_percent_first10[1:nmax] <- 100 * pca_var[1:nmax] / sum(pca_var[1:nmax])

#pca_corr_reads <- apply(pcaPRComp$x,2,function(x) cor(x,report_sub$Assigned))
    
pca_var_out <- data.frame(round(pca_var,3),round(pca_var_percent,1),
                          round(pca_var_percent_first10,1))
#rownames(pca_var_out) <- rownames(pcaPRComp$x)
colnames(pca_var_out) <- c("PC_Var","PC_Var_percent",txt1)
nColToDisplay = 5
df <- as.data.frame(pcaPRComp$x)
df$Cell=as.factor(colData(cdScFiltAnnot)$Sample)
p <- ggpairs(df, columns=1:nColToDisplay, upper=list(continuous="points"), 
             title='Plotting first four PCAs', 
             mapping = aes_string(color="Cell"),
             legend = c(1,nColToDisplay),
             columnLabels = as.character(paste0(colnames(df[,1:nColToDisplay]), ' : ', 
                                                pca_var_out$PC_Var_percent[1:nColToDisplay], '% variance')))+
    theme_light(base_size=15)+     
    theme(plot.title = element_text(hjust = 0.5))

for(i in 1:p$nrow) {
  for(j in 1:p$ncol){
    p[i,j] <- p[i,j] + 
        scale_fill_manual(values=c_sample_col) +
        scale_colour_manual(values=c_sample_col)  
  }
}
    
print(p)
```

The results does not do much with the chosen and the HVG ones. So for the downstream analysis I will take the HVG genes only instead of the correlated HVGs.



### t-SNE plot with only the HVG genes:
```{r figTsneM3DropGenes, fig.height=5, fig.width=6}
tsne_out <- Rtsne(as.matrix(pcaPRComp$x[,1:14]),check_duplicates = FALSE, pca = FALSE,
             perplexity=30, theta=0.01, dims=2, num_threads = 8)

Rep <- as.factor(colData(cdScFiltAnnot)$Sample)


counts <- colData(cdScFiltAnnot)$total_counts
p <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=counts)) +
     geom_point(size=0.75) +
     guides(colour = guide_legend(override.aes = list(size=0.5))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of Sample Data") +
     theme_classic(base_size=14) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank())

p2 <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=Rep)) +
     geom_point(size=0.75) +
     guides(colour = guide_legend(override.aes = list(size=0.8))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of Expression Data") +
     theme_classic(base_size=10) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) +
          scale_fill_manual(values=c_sample_col) +
         scale_colour_manual(values=c_sample_col)
p2

#multiplot(p,p2,cols=2)
```

```{r}
save.image()
```

So the t-SNe plot clearly seperates the patients. This is what we generally see when we have multiple patients. The gene expression profile for every patient itself is very different for cells and that is why cells from different patients come differently. One interesting thing is a cluster with all the cells mixed. Now, this could be the 25% of stromal cells from each patient where as the heterogeneity in tumour samples seperate the patients. 
### Running Umap for data visualization

```{r}
embedding <- umap(pcaPRComp$x[,1:14])
```
```{r figUmap, fig.height=5, fig.width=6}
as.tibble(embedding$layout) %>%
  mutate(Samples = colData(cdScFiltAnnot)$Sample) %>%
  ggplot(aes(V1, V2, color=Samples)) +  geom_point(size=0.75) + theme_classic() +scale_colour_manual(values=c_sample_col) 
```



# Clustering
There are numerous methods of clustering, the dynamic-cut-tree, the k-mean, k-medoids, GMM, GP-LVM and so on. I will straight jump into dynamic-cut-tree as this is the one that performed best while I was doing analysis with the naive and Aspd12 dataset.

## Dynamic Cut Tree
Hierarchical clustering is a widely used method for detecting clusters in genomic data. Clusters are defined by cutting branches off the dendrogram. A common but inflexible method uses a constant height cutoff value; this method exhibits suboptimal performance on complicated dendrograms. We present the Dynamic Tree Cut R library that implements novel dynamic branch cutting methods for detecting clusters in a dendrogram depending on their shape. Compared to the constant height cutoff method, our techniques offer the following advantages: (1) they are capable of identifying nested clusters; (2) they are flexible --- cluster shape parameters can be tuned to suit the application at hand; (3) they are suitable for automation; and (4) they can optionally combine the advantages of hierarchical clustering and partitioning around medoids, giving better detection of outliers.

### Clustering cells into putative subpopulations
We perform hierarchical clustering on the Euclidean distances between cells, using Ward’s criterion to minimize the total variance within each cluster. This yields a dendrogram that groups together cells with similar expression patterns across the chosen genes. An alternative approach is to cluster on a matrix of distances derived from correlations (e.g., as in `quickCluster`). This is more robust to noise and normalization errors, but is also less sensitive to subtle changes in the expression profiles.

```{r}
chosen.exprs <- logcounts(cdScFiltAnnot[rownames(hvg.out),])
my.dist <- dist(t(chosen.exprs))
my.tree <- hclust(my.dist, method = "ward.D2")
```

Clusters are explicitly defined by applying a dynamic tree cut (Langfelder et al., 2008)[https://academic.oup.com/bioinformatics/article/24/5/719/200751] to the dendrogram. This exploits the shape of the branches in the dendrogram to refine the cluster definitions, and is more appropriate than cutree for complex dendrograms. Greater control of the empirical clusters can be obtained by manually specifying cutHeight in cutreeDynamic.

```{r}
library(dynamicTreeCut)
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0))
```

Number of clusters chosen by this method is:
```{r}
levels(as.factor(my.clusters))
```
```{r}
cdScFiltAnnot$Clusters <- as.factor(my.clusters)
```

Drawing the heatmap, first scale te: 
```{r figHeatClust, fig.height=12, fig.width=8}
library(ComplexHeatmap)
heat.vals <- chosen.exprs - rowMeans(chosen.exprs)

#clust.col <- rainbow(max(my.clusters))
#heatmap.2(heat.vals, col=bluered, symbreak=TRUE, trace='none', cexRow=0.3,
#    ColSideColors=clust.col[my.clusters], Colv=as.dendrogram(my.tree))

df = data.frame(Class = as.factor(my.clusters))
ha = HeatmapAnnotation(df = df)
#ha = HeatmapAnnotation(df = df, col = list(Condition = c("MC_A" =  "dodgerblue", "MC_B"= "firebrick", "MC_C"="forestgreen", "MC_D"="gold")))
Heatmap(heat.vals , top_annotation = ha, show_column_names=FALSE, cluster_rows = TRUE, clustering_method_columns = "ward.D2", row_names_gp = gpar(fontsize = 8))

```

Saving the heatmap for later exploration:
```{r}
pdf("Matthew_Hepworth_heat_newDataset.pdf", width=20, height=40)
Heatmap(heat.vals , top_annotation = ha, show_column_names=FALSE, cluster_rows = TRUE, clustering_method_columns = "ward.D2", row_names_gp = gpar(fontsize = 8))
dev.off()
```

t-SNE with cluster allocation:
```{r figtsneClusterAlloc, fig.height=5, fig.width=6}

Clusters <- as.factor(my.clusters)

p2 <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=Clusters)) +
     geom_point(size=1.0) +
     guides(colour = guide_legend(override.aes = list(size=1))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of cluster Data") +
     theme_classic(base_size=10) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) + scale_color_manual(values=c_clust_col)
p2

#multiplot(p,p2,cols=2)
```

```{r}
Clusters <- as.factor(my.clusters)
as.tibble(embedding$layout) %>%
  mutate(Clusters = Clusters) %>%
  ggplot(aes(V1, V2, color=Clusters)) +  geom_point(size=0.75) + theme_classic() + scale_color_manual(values=c_clust_col)
```
```{r}
as.tibble(embedding$layout) %>%
  mutate(Samples = colData(cdScFiltAnnot)$Sample) %>%
  ggplot(aes(V1, V2, color=Samples)) +  geom_point(size=0.75) + theme_classic() + scale_color_manual(values=c_clust_col)
```
How the clustering is impacted with read counts

```{r}
as.tibble(embedding$layout) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color=ReadDepth)) +  geom_point(size=0.75) + theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))
```
```{r}

as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color=ReadDepth)) +  geom_point(size=0.75) + theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))

```


<!--The Cluster result looks very good. Although they were generated using hirerchial clustering, but when overlapped with the t-SNE and the UMAP plot they very nicely overlaps with the visualization. A cluster on the bottom which is a bit more confusing one.

Interestingly Healthy is always making three clusters whereas COPD is always making two clusters. Need to see how much overlap is there.

The problem would be that the samples are so differnt we would not know whether the difference in the clusters are due to different samples or due to real biology.-->
```{r}
p1<-as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(ReadDepth = cdScFiltAnnot$log10_total_counts) %>%
  ggplot(aes(V1, V2, color=ReadDepth)) +  geom_point(size=0.75) + theme_classic() +
  scale_colour_gradientn(colours = rainbow(5))
p2<-as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(Clusters = cdScFiltAnnot$Clusters) %>%
  ggplot(aes(V1, V2, color=Clusters)) +  geom_point(size=0.75) + theme_classic() + scale_color_manual(values=c_clust_col)
```

```{r}
#par(mfrow=c(1,2))
p1 
p2
```

One of my worry was that the clusters and the t-SNE seperation are dominated by the read depth. Although in t-SNE one can see the gradient of the Read Depth (read depths moving into one side) which has been reported in the literature [Hafemeister et. al. 2019](doi: https://doi.org/10.1101/576827) but it surely is not dominating the clusters on the t-SNE seperation. 


## Cluster validation
It looks like that the clustering method that I prefer to use `dynamicTreeCut` is producing 12 clusters after removing the cells from 12th cluster from the first run. In this new clustering cluster-8 is spread everywhere and does not seem to be a valid cluster. So, I will not apply clustering validation methods to see this.

### Internal measures for cluster validation

In this section, we describe the most widely used clustering validation indices. Recall that the goal of partitioning clustering algorithms (Part @ref(partitioning-clustering)) is to split the data set into clusters of objects, such that:

- the objects in the same cluster are similar as much as possible,
- and the objects in different clusters are highly distinct
[Cluster Validation-1](http://www.sthda.com/english/wiki/print.php?id=241)
[Cluster Validation-2](https://www.datanovia.com/en/lessons/cluster-validation-statistics-must-know-methods/)
[Cluster Validation-3](http://www.sthda.com/english/wiki/print.php?id=243)


* That is, we want the average distance within cluster to be as small as possible; and the average distance between clusters to be as large as possible.

Internal validation measures reflect often the compactness, the connectedness and the separation of the cluster partitions.

1. Compactness or cluster cohesion: Measures how close are the objects within the same cluster. A lower within-cluster variation is an indicator of a good compactness (i.e., a good clustering). The different indices for evaluating the compactness of clusters are base on distance measures such as the cluster-wise within average/median distances between observations.
2. Separation: Measures how well-separated a cluster is from other clusters. The indices used as separation measures include:
    * distances between cluster centers
    * the pairwise minimum distances between objects in different clusters
3. Connectivity: corresponds to what extent items are placed in the same cluster as their nearest neighbors in the data space. The connectivity has a value between 0 and infinity and should be minimized.

Generally most of the indices used for internal clustering validation combine compactness and separation measures as follow:

$Index = \frac{\alpha * Seperation}{\beta * Compactness}$
where $\alpha$ and $\beta$ are weights.

### Silhouette coefficient

The silhouette analysis measures how well an observation is clustered and it estimates the average distance between clusters. The silhouette plot displays a measure of how close each point in one cluster is to points in the neighboring clusters.

For each observation $i$, the silhouette width $s_i$ is calculated as follows:

1. For each observation $i$, calculate the average dissimalirty $\alpha_i$ between $i$ and all other points of the cluster which $i$ belongs.
2. For all other clusters $C$, to which $i$ does not belong, calculate the average dissimilarity $d(i,C)$ of $i$ to all observations of $C$. The smallest of these $d(i,C)$ is defined as $b_i = min_C d(i,C)$. The value of $b_i$ can be seen as the dissimilarity between i and its __neighbor__ cluster, i.e. the nearest one to which it does not belong.
3. Finally the silhouette width of the observation $i$ is defined by the formula: $S_i = \frac{(b_i - a_i)}{max(a_i,b_i)}$

Silhouette width can be interpreted as follow:

- Observations with a large Si (almost 1) are very well clustered.
- A small Si (around 0) means that the observation lies between two clusters.
- Observations with a negative Si are probably placed in the wrong cluster.

```{r}
library(cluster)
library(factoextra)
```

```{r}
#my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), verbose=0))
fviz_silhouette(silhouette(my.clusters, my.dist), print.summary = FALSE)
```


It looks like that the stability of cluster 7, 8 & 10 is quite low. Something to keep in mind.

Now, how does the `TP53` and `XIST` expression looks like
```{r}
GeneExp <- logcounts(cdScFiltAnnot)['XIST',]
    #GeneName = 'SPN'
    df <- as.data.frame(tsne_out$Y)
    df[,'GeneExp']=logcounts(cdScFiltAnnot)['XIST',]
p1<-     ggplot(df, aes(x=V1, y=V2, GeneExp = GeneExp)) +
      geom_point(size=1.00,aes(colour = GeneExp), alpha=0.8) +
      scale_colour_gradient(low = "gray88", high = "purple4")+
      #guides(colour = guide_legend(override.aes = list(size=4))) +
      xlab("") + ylab("") +
      ggtitle(paste0('Gene Exp:','XIST'))+
      theme_classic(base_size=14) +
      theme(strip.background = element_blank(),
            strip.text.x     = element_blank(),
            axis.text.x      = element_blank(),
            axis.text.y      = element_blank(),
            axis.ticks       = element_blank(),
            axis.line        = element_blank(),
            panel.border     = element_blank())
p1
```

```{r}
GeneExp <- logcounts(cdScFiltAnnot)['TP53',]
    #GeneName = 'SPN'
    df <- as.data.frame(tsne_out$Y)
    df[,'GeneExp']=logcounts(cdScFiltAnnot)['TP53',]
  p2 <-  ggplot(df, aes(x=V1, y=V2, GeneExp = GeneExp)) +
      geom_point(size=1.00,aes(colour = GeneExp), alpha=0.3) +
      scale_colour_gradient(low = "gray88", high = "purple4")+
      #guides(colour = guide_legend(override.aes = list(size=4))) +
      xlab("") + ylab("") +
      ggtitle(paste0('Gene Exp:','TP53'))+
      theme_classic(base_size=14) +
      theme(strip.background = element_blank(),
            strip.text.x     = element_blank(),
            axis.text.x      = element_blank(),
            axis.text.y      = element_blank(),
            axis.ticks       = element_blank(),
            axis.line        = element_blank(),
            panel.border     = element_blank())
  p2
```

```{r}
p3 <- ggplot(as.data.frame(tsne_out$Y), aes(x=V1, y=V2, color=Rep)) +
     geom_point(size=0.75) +
     guides(colour = guide_legend(override.aes = list(size=0.8))) +
     xlab("") + ylab("") +
     ggtitle("t-SNE 2D Embedding of Expression Data") +
     theme_classic(base_size=10) +
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) +
          scale_fill_manual(values=c_sample_col) +
         scale_colour_manual(values=c_sample_col)
p3
```

```{r}
p4<-as.tibble(as.data.frame(tsne_out$Y)) %>%
  mutate(Clusters = cdScFiltAnnot$Clusters) %>%
  ggplot(aes(x=V1, y=V2, color=Clusters)) +  
  xlab("") + ylab("") +
  guides(colour = guide_legend(override.aes = list(size=0.8))) +
  geom_point(size=0.75) + 
  theme_classic() + 
     theme(strip.background = element_blank(),
           strip.text.x     = element_blank(),
           axis.text.x      = element_blank(),
           axis.text.y      = element_blank(),
           axis.ticks       = element_blank(),
           axis.line        = element_blank(),
           panel.border     = element_blank()) +
    scale_color_manual(values=c_clust_col)
p4
```

```{r multPlotWithXIST_TP53, fig.height=8}
multiplot(p1,p3,p2,p4,cols=2)
```


# Identifying marker genes

Potential marker genes are identified by taking the top set of DE genes from each pairwise comparison between clusters. We arrange the results into a single output table that allows a marker set to be easily defined for a user-specified size of the top set. For example, to construct a marker set from the top 10 genes of each comparison, one would filter `marker.set` to retain rows with Top less than or equal to 10.

We save the list of candidate marker genes for further examination. We also examine their expression profiles to verify that the DE signature is robust. The heatmap figure below indicates that most of the top markers have strong and consistent up- or downregulation in cells of cluster 1 compared to some or all of the other clusters. Thus, cells from the subpopulation of interest can be identified as those that express the upregulated markers and do not express the downregulated markers.

```{r}
library(edgeR)
```


```{r}
cluster <- factor(my.clusters)
de.design <- model.matrix(~0 + cluster)
y <- convertTo(cdScFiltAnnot, type="edgeR")
y <- estimateDisp(y, de.design)
fit <- glmFit(y, de.design)
summary(y$tagwise.dispersion)
```
```{r}
clust.col <- rainbow(max(my.clusters))
```

## Cluster1 marker genes
```{r}
result1.logFC <- result1.FDR <- list()
chosen.clust <- which(levels(cluster)=="1") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result1.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result1.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result1.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker1.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result1.logFC), 
    FDR = result1.FDR, stringsAsFactors=FALSE)
marker1.set <- marker1.set[order(marker1.set$Top),]

marker1.set.pos <- marker1.set[rowSums(marker1.set[,3:11]>0)==9,]
marker1.set.pos <- marker1.set.pos[order(marker1.set.pos$Top),]

head(marker1.set, 10)
```
```{r}
write.table(marker1.set, file="Louisa_Nelson_10XDataset_Cluster1.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker1.set.pos, file="Louisa_Nelson_10XDataset_Cluster1_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker1.set$Gene[marker1.set$Top <= 10]
```


## Cluster2 marker genes
```{r}
result2.logFC <- result2.FDR <- list()
chosen.clust <- which(levels(cluster)=="2") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result2.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result2.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result2.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker2.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result2.logFC), 
    FDR = result2.FDR, stringsAsFactors=FALSE)
marker2.set <- marker2.set[order(marker2.set$Top),]

marker2.set.pos <- marker2.set[rowSums(marker2.set[,3:11]>0)==9,]
marker2.set.pos <- marker2.set.pos[order(marker2.set.pos$Top),]

head(marker2.set, 10)
```

```{r}
marker2.set.strong.gene <-  marker2.set[rowSums(abs(marker2.set[,3:13])>1)>6,]
head(marker2.set.strong.gene)
```
```{r}
write.table(marker2.set.strong.gene, file="Cluster2_HighLo2Fold.txv", sep="\t", quote=FALSE, col.names=NA)
```



```{r}
write.table(marker2.set, file="Louisa_Nelson_10XDataset_Cluster2.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker2.set.pos, file="Louisa_Nelson_10XDataset_Cluster2_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker2.set$Gene[marker2.set$Top <= 10]
```


## Cluster3 marker genes
```{r}
result3.logFC <- result3.FDR <- list()
chosen.clust <- which(levels(cluster)=="3") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result3.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result3.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result3.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker3.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result3.logFC), 
    FDR = result3.FDR, stringsAsFactors=FALSE)
marker3.set <- marker3.set[order(marker3.set$Top),]

marker3.set.pos <- marker3.set[rowSums(marker3.set[,3:11]>0)==9,]
marker3.set.pos <- marker3.set.pos[order(marker3.set.pos$Top),]

head(marker3.set, 10)
```


```{r}
write.table(marker3.set, file="Louisa_Nelson_10XDataset_Cluster3.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker3.set.pos, file="Louisa_Nelson_10XDataset_Cluster3_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker3.set$Gene[marker1.set$Top <= 10]
```

## Cluster4 marker genes
```{r}
result4.logFC <- result4.FDR <- list()
chosen.clust <- which(levels(cluster)=="4") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result4.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result4.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result4.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker4.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result4.logFC), 
    FDR = result4.FDR, stringsAsFactors=FALSE)
marker4.set <- marker4.set[order(marker4.set$Top),]

marker4.set.pos <- marker4.set[rowSums(marker4.set[,3:11]>0)==9,]
marker4.set.pos <- marker4.set.pos[order(marker4.set.pos$Top),]

head(marker4.set, 10)
```


```{r}
write.table(marker4.set, file="Louisa_Nelson_10XDataset_Cluster4.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker4.set.pos, file="Louisa_Nelson_10XDataset_Cluster4_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker1.set$Gene[marker4.set$Top <= 10]
```

## Cluster5 marker genes
```{r}
result5.logFC <- result5.FDR <- list()
chosen.clust <- which(levels(cluster)=="5") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result5.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result5.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result5.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker5.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result5.logFC), 
    FDR = result5.FDR, stringsAsFactors=FALSE)
marker5.set <- marker5.set[order(marker5.set$Top),]

marker5.set.pos <- marker5.set[rowSums(marker5.set[,3:11]>0)==9,]
marker5.set.pos <- marker5.set.pos[order(marker5.set.pos$Top),]

head(marker5.set, 10)
```




```{r}
#write.table(marker5.set, file="Louisa_Nelson_10XDataset_Cluster5.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker5.set.pos, file="Louisa_Nelson_10XDataset_Cluster5_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker5.set$Gene[marker5.set$Top <= 10]
```

## Cluster6 marker genes
```{r}
result6.logFC <- result6.FDR <- list()
chosen.clust <- which(levels(cluster)=="6") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result6.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result6.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result6.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker6.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result6.logFC), 
    FDR = result6.FDR, stringsAsFactors=FALSE)
marker6.set <- marker6.set[order(marker6.set$Top),]

marker6.set.pos <- marker6.set[rowSums(marker6.set[,3:11]>0)==9,]
marker6.set.pos <- marker6.set.pos[order(marker6.set.pos$Top),]

head(marker6.set, 10)
```


```{r}
write.table(marker6.set, file="Louisa_Nelson_10XDataset_Cluster6.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker6.set.pos, file="Louisa_Nelson_10XDataset_Cluster6_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker6.set$Gene[marker1.set$Top <= 10]
```


## Cluster7 marker genes
```{r}
result7.logFC <- result7.FDR <- list()
chosen.clust <- which(levels(cluster)=="7") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result7.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result7.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result7.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker7.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result7.logFC), 
    FDR = result7.FDR, stringsAsFactors=FALSE)
marker7.set <- marker7.set[order(marker7.set$Top),]

marker7.set.pos <- marker7.set[rowSums(marker7.set[,3:11]>0)==9,]
marker7.set.pos <- marker7.set.pos[order(marker7.set.pos$Top),]

head(marker7.set, 10)
```


```{r}
write.table(marker7.set, file="Louisa_Nelson_10XDataset_Cluster7.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker7.set.pos, file="Louisa_Nelson_10XDataset_Cluster7_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker7.set$Gene[marker7.set$Top <= 10]
```

## Cluster8 marker genes
```{r}
result8.logFC <- result8.FDR <- list()
chosen.clust <- which(levels(cluster)=="8") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result8.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result8.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result8.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker8.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result8.logFC), 
    FDR = result8.FDR, stringsAsFactors=FALSE)
marker8.set <- marker8.set[order(marker8.set$Top),]

marker8.set.pos <- marker8.set[rowSums(marker8.set[,3:11]>0)==9,]
marker8.set.pos <- marker8.set.pos[order(marker8.set.pos$Top),]

head(marker8.set, 10)
```


```{r}
write.table(marker8.set, file="Louisa_Nelson_10XDataset_Cluster8.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker8.set.pos, file="Louisa_Nelson_10XDataset_Cluster8_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker8.set$Gene[marker8.set$Top <= 10]
```



## Cluster9 marker genes
```{r}
result9.logFC <- result9.FDR <- list()
chosen.clust <- which(levels(cluster)=="9") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result9.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result9.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result9.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker9.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result9.logFC), 
    FDR = result9.FDR, stringsAsFactors=FALSE)
marker9.set <- marker9.set[order(marker9.set$Top),]

marker9.set.pos <- marker9.set[rowSums(marker9.set[,3:11]>0)==9,]
marker9.set.pos <- marker9.set.pos[order(marker9.set.pos$Top),]

head(marker9.set, 10)
```


```{r}
write.table(marker9.set, file="Louisa_Nelson_10XDataset_Cluster9.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker9.set.pos, file="Louisa_Nelson_10XDataset_Cluster9_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker9.set$Gene[marker1.set$Top <= 10]
```

## Cluster10 marker genes
```{r}
result10.logFC <- result10.FDR <- list()
chosen.clust <- which(levels(cluster)=="10") # character, as ’cluster’ is a factor.
for (clust in seq_len(nlevels(cluster))) {
    if (clust==chosen.clust) { next }
    contrast <- numeric(ncol(de.design))
    contrast[chosen.clust] <- 1
    contrast[clust] <- -1
    fit <- glmQLFit(y, de.design)
    res <- glmQLFTest(fit, contrast = contrast)
    top.tags <- topTags(res, n=length(y$design), sort.by="none")
    
    con.name <- paste0('vs.', levels(cluster)[clust])
    result10.logFC[[con.name]] <- top.tags$table$logFC
    #names(result1.logFC[[con.name]]) <- rownames(top.tags$table)
    result10.FDR[[con.name]] <- top.tags$table$FDR
    #names(result1.FDR[[con.name]]) <- rownames(top.tags$table)
}
```

```{r}
collected.ranks <- lapply(result10.FDR, rank, ties="first")
min.rank <- do.call(pmin, collected.ranks)

marker10.set <- data.frame(Top=min.rank, Gene=rownames(y),
    logFC=do.call(cbind, result10.logFC), 
    FDR = result10.FDR, stringsAsFactors=FALSE)
marker10.set <- marker10.set[order(marker10.set$Top),]

marker10.set.pos <- marker10.set[rowSums(marker10.set[,3:11]>0)==9,]
marker10.set.pos <- marker10.set.pos[order(marker10.set.pos$Top),]

head(marker10.set, 10)
```


```{r}
write.table(marker10.set, file="Louisa_Nelson_10XDataset_Cluster10.tsv", sep="\t", quote=FALSE, col.names=NA)
write.table(marker10.set.pos, file="Louisa_Nelson_10XDataset_Cluster10_pos.tsv", sep="\t", quote=FALSE, col.names=NA)
top.markers <- marker10.set$Gene[marker10.set$Top <= 10]
```

